diff --git a/404.html b/404.html index 6d685fd..b7bc0a7 100644 --- a/404.html +++ b/404.html @@ -314,8 +314,25 @@
Here’s the full code for the function and its documentation.
#' Praise a weekday
#'
diff --git a/dge-model-building-with-variancepartition.html b/dge-model-building-with-variancepartition.html
index 881bbc2..a0fee53 100644
--- a/dge-model-building-with-variancepartition.html
+++ b/dge-model-building-with-variancepartition.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/differential-gene-expression-analysis-overview.html b/differential-gene-expression-analysis-overview.html
index 12819cd..ce5aeaa 100644
--- a/differential-gene-expression-analysis-overview.html
+++ b/differential-gene-expression-analysis-overview.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/differential-gene-expression-analysis-with-limma-voom.html b/differential-gene-expression-analysis-with-limma-voom.html
index 1b2ad06..1c4e1bb 100644
--- a/differential-gene-expression-analysis-with-limma-voom.html
+++ b/differential-gene-expression-analysis-with-limma-voom.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/differential-gene-expression-exercise.html b/differential-gene-expression-exercise.html
index 7092ce3..e4e425a 100644
--- a/differential-gene-expression-exercise.html
+++ b/differential-gene-expression-exercise.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/final-r-session.html b/final-r-session.html
index 30bdcf3..904369b 100644
--- a/final-r-session.html
+++ b/final-r-session.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
@@ -396,7 +413,7 @@ Final R Sessioncurl::curl_version()
-#> $version
-#> [1] "7.81.0"
-#>
-#> $ssl_version
-#> [1] "OpenSSL/3.0.2"
-#>
-#> $libz_version
-#> [1] "1.2.11"
-#>
-#> $libssh_version
-#> [1] "libssh/0.9.6/openssl/zlib"
-#>
-#> $libidn_version
-#> [1] "2.3.2"
-#>
-#> $host
-#> [1] "x86_64-pc-linux-gnu"
-#>
-#> $protocols
-#> [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap"
-#> [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp"
-#> [23] "smtps" "telnet" "tftp"
-#>
-#> $ipv6
-#> [1] TRUE
-#>
-#> $http2
-#> [1] TRUE
-#>
-#> $idn
-#> [1] TRUE
This interactive book was last updated at 2024-06-10 23:30:30.562128.
+curl::curl_version()
+#> $version
+#> [1] "7.81.0"
+#>
+#> $ssl_version
+#> [1] "OpenSSL/3.0.2"
+#>
+#> $libz_version
+#> [1] "1.2.11"
+#>
+#> $libssh_version
+#> [1] "libssh/0.9.6/openssl/zlib"
+#>
+#> $libidn_version
+#> [1] "2.3.2"
+#>
+#> $host
+#> [1] "x86_64-pc-linux-gnu"
+#>
+#> $protocols
+#> [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap"
+#> [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp"
+#> [23] "smtps" "telnet" "tftp"
+#>
+#> $ipv6
+#> [1] TRUE
+#>
+#> $http2
+#> [1] TRUE
+#>
+#> $idn
+#> [1] TRUE
This interactive book was last updated at 2024-06-11 11:00:51.293221.
proc.time()
## user system elapsed
-## 13.821 1.070 14.742
+## 14.294 0.998 15.070
System curl
version:
## $version
@@ -846,7 +863,7 @@ R session information
License
diff --git a/interactive-summarizedexperiment-visualizations.html b/interactive-summarizedexperiment-visualizations.html
index d7643a1..5b54c08 100644
--- a/interactive-summarizedexperiment-visualizations.html
+++ b/interactive-summarizedexperiment-visualizations.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
@@ -668,7 +685,7 @@ 2.4.1 Setting up the data## Lets get some data using spatialLIBD
sce_layer <- spatialLIBD::fetch_data("sce_layer")
#> adding rname 'https://www.dropbox.com/s/bg8xwysh2vnjwvg/Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata?dl=1'
-#> 2024-06-10 23:27:14.700913 loading file /github/home/.cache/R/BiocFileCache/3993f119bd3_Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata%3Fdl%3D1
+#> 2024-06-11 10:55:10.106743 loading file /github/home/.cache/R/BiocFileCache/4764df94b32_Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata%3Fdl%3D1
sce_layer
#> class: SingleCellExperiment
#> dim: 22331 76
diff --git a/interpreting-model-coefficients-with-exploremodelmatrix.html b/interpreting-model-coefficients-with-exploremodelmatrix.html
index 840e15c..3865b2f 100644
--- a/interpreting-model-coefficients-with-exploremodelmatrix.html
+++ b/interpreting-model-coefficients-with-exploremodelmatrix.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/introduction-to-spatial-transcriptomics.html b/introduction-to-spatial-transcriptomics.html
index 9e3c727..d265f5e 100644
--- a/introduction-to-spatial-transcriptomics.html
+++ b/introduction-to-spatial-transcriptomics.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/making-your-own-website-with-postcards.html b/making-your-own-website-with-postcards.html
index faf2090..1657705 100644
--- a/making-your-own-website-with-postcards.html
+++ b/making-your-own-website-with-postcards.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
@@ -401,131 +418,131 @@ 14.1 here## Install the package manually
-# install.packages("here")
-
-## Load "here" (previously installed)
-library("here")
## Install the package manually
+# install.packages("here")
+
+## Load "here" (previously installed)
+library("here")
Sometimes there might be an error, as it might clash with other packages (like plyr
). To avoid this, we can use here::here()
(which basically clarifies that the requested function is from the here
package).
Some useful commands are getwd()
and setwd()
, which deal with the working directory, which is the default location where R looks for files to read or save.
getwd()
retrieves the current working directory.
setwd()
allows changing the current working directory.
-Best Practice:
Instead of using “setwd” to manually set your working directory, it is often better to use the “here” package. Using “here” avoids issues with hard-coded paths and ensures your scripts work regardless of the specific setup of your working environment.
## Instead of "C:/Users/user/Desktop/data/myfile.csv"
-
-## Use here to construct file paths
-file_path <- here("Users", "user", "Desktop", "data", "myfile.csv")
-# file_path <- here:here("Users", "user", "Desktop","data", "myfile.csv")
-data <- read.csv(file_path)
## Instead of "C:/Users/user/Desktop/data/myfile.csv"
+
+## Use here to construct file paths
+file_path <- here("Users", "user", "Desktop", "data", "myfile.csv")
+# file_path <- here:here("Users", "user", "Desktop","data", "myfile.csv")
+data <- read.csv(file_path)
Other examples of how “here” could be used:
-## Example: save data to a file and load it
-a <- 1
-c <- 23
-
-save(a, c, file = here("test-data.RData"))
-# save(a, c, file = here:here("test-data.RData"))
-load(here("test-data.RData"))
-# load(here:here("test-data.RData"))
-
-## Create a directory
-dir.create(here("subdirectory"), showWarnings = FALSE)
-# dir.create(here:here("subdirectory"), showWarnings = FALSE)
-
-## Create a file, indicating the subdirectory (the first argument in this case)
-file.create(here("subdirectory", "filename"))
-#> [1] TRUE
# file.create(here:here("subdirectory", "filename"))
-
-## Open the new created file
-file.show(here("subdirectory", "filename"))
-# file.show(here:here("subdirectory", "filename"))
-
-## For example, if we want to see our files in the directory
-list.files(here(), recursive = TRUE)
-#> [1] "_main_files/figure-html/CCA-1.png"
-#> [2] "_main_files/figure-html/cut_dendogram-1.png"
-#> [3] "_main_files/figure-html/cut_dendogram-2.png"
-#> [4] "_main_files/figure-html/EMM_example1-1.png"
-#> [5] "_main_files/figure-html/heat map-1.png"
-#> [6] "_main_files/figure-html/hist_libSizeFactors-1.png"
-#> [7] "_main_files/figure-html/hist_p-1.png"
-#> [8] "_main_files/figure-html/modelGeneVar_batch-1.png"
-#> [9] "_main_files/figure-html/modelGeneVar_zeisel-1.png"
-#> [10] "_main_files/figure-html/modelGeneVarByPoisson_zeisel-1.png"
-#> [11] "_main_files/figure-html/modelGeneVarWithSpikes_416b-1.png"
-#> [12] "_main_files/figure-html/PCs_zeisel-1.png"
-#> [13] "_main_files/figure-html/plot_clusters_zeisel-1.png"
-#> [14] "_main_files/figure-html/plot_dendogram-1.png"
-#> [15] "_main_files/figure-html/Plot_multiplePCA_PCs-1.png"
-#> [16] "_main_files/figure-html/QC_sce416b_plots-1.png"
-#> [17] "_main_files/figure-html/runTSNE_zeisel-1.png"
-#> [18] "_main_files/figure-html/TSNE_perplexity_plots-1.png"
-#> [19] "_main_files/figure-html/Umap_zeisel-1.png"
-#> [20] "_main_files/figure-html/unnamed-chunk-14-1.png"
-#> [21] "_main_files/figure-html/unnamed-chunk-15-1.png"
-#> [22] "_main_files/figure-html/unnamed-chunk-16-1.png"
-#> [23] "_main_files/figure-html/unnamed-chunk-17-1.png"
-#> [24] "_main_files/figure-html/unnamed-chunk-18-1.png"
-#> [25] "_main_files/figure-html/unnamed-chunk-19-1.png"
-#> [26] "_main_files/figure-html/VarExplained_PCs-1.png"
-#> [27] "_main_files/figure-html/volcano plot-1.png"
-#> [28] "_main_files/figure-html/voom-1.png"
-#> [29] "_main.Rmd"
-#> [30] "01_SummarizedExperiment.R"
-#> [31] "01_SummarizedExperiment.Rmd"
-#> [32] "02_iSEE.R"
-#> [33] "02_iSEE.Rmd"
-#> [34] "03_recount3_intro.R"
-#> [35] "03_recount3_intro.Rmd"
-#> [36] "04_DGE_analysis_overview.R"
-#> [37] "04_DGE_analysis_overview.Rmd"
-#> [38] "05_DGE_with_limma_voom.R"
-#> [39] "05_DGE_with_limma_voom.Rmd"
-#> [40] "06_ExploreModelMatrix.R"
-#> [41] "06_ExploreModelMatrix.Rmd"
-#> [42] "07_model_variable_selection.R"
-#> [43] "07_model_variable_selection.Rmd"
-#> [44] "08_DEG_exercise.R"
-#> [45] "08_DEG_exercise.Rmd"
-#> [46] "09_research_talks.R"
-#> [47] "09_research_talks.Rmd"
-#> [48] "10_biocthis_intro.R"
-#> [49] "10_biocthis_intro.Rmd"
-#> [50] "11_scRNAseq_overview.R"
-#> [ reached getOption("max.print") -- omitted 58 entries ]
## Example: save data to a file and load it
+a <- 1
+c <- 23
+
+save(a, c, file = here("test-data.RData"))
+# save(a, c, file = here:here("test-data.RData"))
+load(here("test-data.RData"))
+# load(here:here("test-data.RData"))
+
+## Create a directory
+dir.create(here("subdirectory"), showWarnings = FALSE)
+# dir.create(here:here("subdirectory"), showWarnings = FALSE)
+
+## Create a file, indicating the subdirectory (the first argument in this case)
+file.create(here("subdirectory", "filename"))
+#> [1] TRUE
# file.create(here:here("subdirectory", "filename"))
+
+## Open the new created file
+file.show(here("subdirectory", "filename"))
+# file.show(here:here("subdirectory", "filename"))
+
+## For example, if we want to see our files in the directory
+list.files(here(), recursive = TRUE)
+#> [1] "_main_files/figure-html/assigned_vs_ann_heatmap-1.png"
+#> [2] "_main_files/figure-html/auc_explore_plots-1.png"
+#> [3] "_main_files/figure-html/CCA-1.png"
+#> [4] "_main_files/figure-html/cut_dendogram-1.png"
+#> [5] "_main_files/figure-html/cut_dendogram-2.png"
+#> [6] "_main_files/figure-html/EMM_example1-1.png"
+#> [7] "_main_files/figure-html/heat map-1.png"
+#> [8] "_main_files/figure-html/hist_libSizeFactors-1.png"
+#> [9] "_main_files/figure-html/hist_p-1.png"
+#> [10] "_main_files/figure-html/lessRes_clustering-1.png"
+#> [11] "_main_files/figure-html/modelGeneVar_batch-1.png"
+#> [12] "_main_files/figure-html/modelGeneVar_zeisel-1.png"
+#> [13] "_main_files/figure-html/modelGeneVarByPoisson_zeisel-1.png"
+#> [14] "_main_files/figure-html/modelGeneVarWithSpikes_416b-1.png"
+#> [15] "_main_files/figure-html/PCs_zeisel-1.png"
+#> [16] "_main_files/figure-html/plot_clusters_zeisel-1.png"
+#> [17] "_main_files/figure-html/plot_dendogram-1.png"
+#> [18] "_main_files/figure-html/plot_markergenes1-1.png"
+#> [19] "_main_files/figure-html/plot_markers_byblock-1.png"
+#> [20] "_main_files/figure-html/Plot_multiplePCA_PCs-1.png"
+#> [21] "_main_files/figure-html/plotDots_markers-1.png"
+#> [22] "_main_files/figure-html/predicted_vs_clusters_heatmap-1.png"
+#> [23] "_main_files/figure-html/QC_sce416b_plots-1.png"
+#> [24] "_main_files/figure-html/runTSNE_zeisel-1.png"
+#> [25] "_main_files/figure-html/set_PBMC_dataset-1.png"
+#> [26] "_main_files/figure-html/set_PBMC_dataset-2.png"
+#> [27] "_main_files/figure-html/top_markers_heatmap-1.png"
+#> [28] "_main_files/figure-html/TSNE_perplexity_plots-1.png"
+#> [29] "_main_files/figure-html/Umap_zeisel-1.png"
+#> [30] "_main_files/figure-html/unnamed-chunk-14-1.png"
+#> [31] "_main_files/figure-html/unnamed-chunk-15-1.png"
+#> [32] "_main_files/figure-html/unnamed-chunk-16-1.png"
+#> [33] "_main_files/figure-html/unnamed-chunk-17-1.png"
+#> [34] "_main_files/figure-html/unnamed-chunk-18-1.png"
+#> [35] "_main_files/figure-html/unnamed-chunk-19-1.png"
+#> [36] "_main_files/figure-html/VarExplained_PCs-1.png"
+#> [37] "_main_files/figure-html/volcano plot-1.png"
+#> [38] "_main_files/figure-html/voom-1.png"
+#> [39] "_main.Rmd"
+#> [40] "01_SummarizedExperiment.R"
+#> [41] "01_SummarizedExperiment.Rmd"
+#> [42] "02_iSEE.R"
+#> [43] "02_iSEE.Rmd"
+#> [44] "03_recount3_intro.R"
+#> [45] "03_recount3_intro.Rmd"
+#> [46] "04_DGE_analysis_overview.R"
+#> [47] "04_DGE_analysis_overview.Rmd"
+#> [48] "05_DGE_with_limma_voom.R"
+#> [49] "05_DGE_with_limma_voom.Rmd"
+#> [50] "06_ExploreModelMatrix.R"
+#> [ reached getOption("max.print") -- omitted 68 entries ]
The usethis
package simplifies many common setup tasks and workflows in R. It helps streamline the process of creating new projects, setting up Git
repositories, and connecting with GitHub
. Mastering usethis
allows you to focus more on coding and less on configuration.
In this case, the package is already installed so we just need to load it.
-## Install the package manually
-# install.packages("usethis")
-
-## Load "usethis (previously installed)
-library("usethis")
## Install the package manually
+# install.packages("usethis")
+
+## Load "usethis (previously installed)
+library("usethis")
Usage:
All use_*()
functions operate on the current directory.
✔ indicates that usethis has setup everything for you. ● indicates that you’ll need to do some work yourself.
-## For example, create a README file
-usethis::use_readme_md()
-#> ✔ Setting active project to '/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024'
-#> ✔ Writing 'README.md'
## For example, create a README file
+usethis::use_readme_md()
+#> ✔ Setting active project to '/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024'
+#> ✔ Writing 'README.md'
More functions in usethis: usethis RDocumentation
In the following exercises, we will see some uses of usethis
.
After installing Git
, restart RStudio
to allow it to annex.
In this case, the packages are already installed so we just need to load them.
- +To connect our RStudio repository with GitHub
, we request a token, which allows GitHub
to grant permission to our computer.
You can request the token using R (choose a meaningful name).
-## Initiate connection with GitHub
-usethis::create_github_token() # redirects to GitHub where you'll choose a specific name for the token
## Initiate connection with GitHub
+usethis::create_github_token() # redirects to GitHub where you'll choose a specific name for the token
Copy the token to enter it later with gitcreds_set()
Another way to request the token is by going to GitHub Tokens, this option will provide a recommendation of the parameters to select.
The token expiration parameter can be changed so it does not expire (for security, GitHub
does not recommend this). Otherwise, consider its validity period.
You can always generate a new one (don’t forget to delete the previous token).
The next step is to configure our GitHub
user in the global .gitconfig
file:
Now let’s initialize the repository in Git
(locally on your computer) and then request to connect it with GitHub servers. Git
is the software while GitHub
is the web platform (based on Git
) that allows collaboration.
## Initialize the Git repository
-usethis::use_git()
-
-## Connect your local Git repository with GitHub servers
-usethis::use_github()
## Initialize the Git repository
+usethis::use_git()
+
+## Connect your local Git repository with GitHub servers
+usethis::use_github()
** Done **
Useful command to check configuration:
- +git_log()
git_push()
## Write a new file, using here::here to specify the path
-writeLines("hello", here::here("R", "test-here.R"))
-
-## Another way is to use use_r
-usethis::use_r("test-file-github.R") # adds file to the project's R directory
-
-## For example, we might try adding something new
-gert::git_add("R/test-file-github.R")
-
-## Add commit of what was done
-gert::git_commit("uploaded test file")
-
-## Gives info about the commits
-gert::git_log()
-
-## Upload your changes from the local repo to GitHub
-gert::git_push() # IMPORTANT COMMAND
## Write a new file, using here::here to specify the path
+writeLines("hello", here::here("R", "test-here.R"))
+
+## Another way is to use use_r
+usethis::use_r("test-file-github.R") # adds file to the project's R directory
+
+## For example, we might try adding something new
+gert::git_add("R/test-file-github.R")
+
+## Add commit of what was done
+gert::git_commit("uploaded test file")
+
+## Gives info about the commits
+gert::git_log()
+
+## Upload your changes from the local repo to GitHub
+gert::git_push() # IMPORTANT COMMAND
It might be more user-friendly to use the Git
pane that appears in RStudio :)
---
title: "Welcome to My Website"
author: "Your Name"
-date: "2024-06-10"
+date: "2024-06-11"
output: html_document
---
@@ -671,7 +688,7 @@ 14.4.2 2. Create index.Rmd for th
14.4.3 3. Render the Site
To render the site, use the rmarkdown::render_site()
function, which converts all R Markdown and Markdown files into HTML. The resulting HTML files and resources are placed in a directory, typically _site
. RStudio facilitates this process with tools like the “Knit” button for individual pages and the “Build” pane for the entire site.
Common elements, such as shared HTML files and CSS for styling, ensure consistency and avoid redundancy. A well-configured navigation bar enhances user experience by providing easy access to different sections.
-
+
14.4.4 4. Publish the Website
@@ -720,11 +737,11 @@ 14.5 postcards
14.5.1 Installation
In this case, the package is already installed.
-
+
14.5.2 Templates
@@ -774,24 +791,24 @@ 14.6.1 Create a New Project in RS
An R Markdown document with your site’s content
A sample photo you should replace (with your own)
-
+
14.6.3 Choose a Template
-## Choose only one template (the one you like the most)
-postcards::create_postcard(template = "jolla")
-postcards::create_postcard(template = "jolla-blue")
-postcards::create_postcard(template = "trestles")
-postcards::create_postcard(template = "onofre")
-postcards::create_postcard(template = "solana")
+## Choose only one template (the one you like the most)
+postcards::create_postcard(template = "jolla")
+postcards::create_postcard(template = "jolla-blue")
+postcards::create_postcard(template = "trestles")
+postcards::create_postcard(template = "onofre")
+postcards::create_postcard(template = "solana")
In this way, you will also get the 2 important files:
- An R Markdown document with your site’s content
@@ -809,13 +826,13 @@ 14.6.5 Deploy the Page## Deploy the GitHub page
-rmarkdown::render("index.Rmd")
+
** Done **
-
-14.7 References
+
+14.7 References
- https://comunidadbioinfo.github.io/cdsb2021_scRNAseq/ejercicio-usando-usethis-here-y-postcards.html#vinculando-rstudio-con-git-y-github
- https://here.r-lib.org/
diff --git a/re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html b/re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html
index 5ea427e..1d316f1 100644
--- a/re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html
+++ b/re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html
@@ -314,8 +314,25 @@
- 11.8.3 Hierarchical clustering
- 11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
@@ -397,24 +414,24 @@ 13 Re-use of bulk RNA-seq methods
13.1 Spatial registration
In 2023, Louise A. Huuki-Myers contributed a new vignette to spatialLIBD
as noted on the package news / changelog: http://research.libd.org/spatialLIBD/news/index.html#spatiallibd-1132.
You should be able to run without any issues the code Louise explained at http://research.libd.org/spatialLIBD/articles/guide_to_spatial_registration.html. This same information is displayed at https://bioconductor.org/packages/release/data/experiment/vignettes/spatialLIBD/inst/doc/guide_to_spatial_registration.html.
-## get reference layer enrichment statistics
-layer_modeling_results <- spatialLIBD::fetch_data(type = "modeling_results")
-#> adding rname 'https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1'
-#> 2024-06-10 23:30:27.833595 loading file /github/home/.cache/R/BiocFileCache/399798f9392_Human_DLPFC_Visium_modeling_results.Rdata%3Fdl%3D1
+## get reference layer enrichment statistics
+layer_modeling_results <- spatialLIBD::fetch_data(type = "modeling_results")
+#> adding rname 'https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1'
+#> 2024-06-11 11:00:48.91264 loading file /github/home/.cache/R/BiocFileCache/47672ee9738_Human_DLPFC_Visium_modeling_results.Rdata%3Fdl%3D1
If the above doesn’t work, related to the curl
issue we previously discussed, then use this workaround:
-tmp_modeling_results <- tempfile("modeling_results.RData")
-download.file(
- "https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1",
- tmp_modeling_results,
- mode = "wb"
-)
-load(tmp_modeling_results, verbose = TRUE)
-#> Loading objects:
-#> modeling_results
-
-## Let's rename the object into the name used in the
-## spatial registration vignette (from spatialLIBD)
-layer_modeling_results <- modeling_results
+tmp_modeling_results <- tempfile("modeling_results.RData")
+download.file(
+ "https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1",
+ tmp_modeling_results,
+ mode = "wb"
+)
+load(tmp_modeling_results, verbose = TRUE)
+#> Loading objects:
+#> modeling_results
+
+## Let's rename the object into the name used in the
+## spatial registration vignette (from spatialLIBD)
+layer_modeling_results <- modeling_results
This journal club style video of the main results of the spatialDLPFC
paper does explain the basics of spatial registration:
## Find your project of interest. Here we'll use
@@ -442,24 +459,24 @@ 3.2 Using recount3## Build a RangedSummarizedExperiment (RSE) object
## with the information at the gene level
rse_gene_SRP009615 <- create_rse(proj_info)
-#> 2024-06-10 23:27:39.53866 downloading and reading the metadata.
-#> 2024-06-10 23:27:40.23701 caching file sra.sra.SRP009615.MD.gz.
+#> 2024-06-11 10:55:37.221981 downloading and reading the metadata.
+#> 2024-06-11 10:55:37.724484 caching file sra.sra.SRP009615.MD.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.sra.SRP009615.MD.gz'
-#> 2024-06-10 23:27:42.632536 caching file sra.recount_project.SRP009615.MD.gz.
+#> 2024-06-11 10:55:39.000263 caching file sra.recount_project.SRP009615.MD.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_project.SRP009615.MD.gz'
-#> 2024-06-10 23:27:44.828246 caching file sra.recount_qc.SRP009615.MD.gz.
+#> 2024-06-11 10:55:40.337901 caching file sra.recount_qc.SRP009615.MD.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_qc.SRP009615.MD.gz'
-#> 2024-06-10 23:27:46.32683 caching file sra.recount_seq_qc.SRP009615.MD.gz.
+#> 2024-06-11 10:55:41.57176 caching file sra.recount_seq_qc.SRP009615.MD.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_seq_qc.SRP009615.MD.gz'
-#> 2024-06-10 23:27:47.879167 caching file sra.recount_pred.SRP009615.MD.gz.
+#> 2024-06-11 10:55:42.992307 caching file sra.recount_pred.SRP009615.MD.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_pred.SRP009615.MD.gz'
-#> 2024-06-10 23:27:49.07915 downloading and reading the feature information.
-#> 2024-06-10 23:27:49.605369 caching file human.gene_sums.G026.gtf.gz.
+#> 2024-06-11 10:55:43.849074 downloading and reading the feature information.
+#> 2024-06-11 10:55:44.457463 caching file human.gene_sums.G026.gtf.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/annotations/gene_sums/human.gene_sums.G026.gtf.gz'
-#> 2024-06-10 23:27:51.119758 downloading and reading the counts: 12 samples across 63856 features.
-#> 2024-06-10 23:27:51.705248 caching file sra.gene_sums.SRP009615.G026.gz.
+#> 2024-06-11 10:55:45.823854 downloading and reading the counts: 12 samples across 63856 features.
+#> 2024-06-11 10:55:46.293574 caching file sra.gene_sums.SRP009615.G026.gz.
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/gene_sums/15/SRP009615/sra.gene_sums.SRP009615.G026.gz'
-#> 2024-06-10 23:27:53.086475 constructing the RangedSummarizedExperiment (rse) object.
+#> 2024-06-11 10:55:47.329599 constructing the RangedSummarizedExperiment (rse) object.
## Explore the resulting object
rse_gene_SRP009615
#> class: RangedSummarizedExperiment
diff --git a/reference-keys.txt b/reference-keys.txt
index aa06de6..1e4c10e 100644
--- a/reference-keys.txt
+++ b/reference-keys.txt
@@ -132,7 +132,18 @@ adjusting-the-parameters
hierarchical-clustering
subclustering
marker-gene-detection
+scoring-markers-by-pairwise-comparisons
+effect-sizes-for-pairwise-comparisons
+summaries-of-pairwise-effects
+choose-top-markers
+handling-blocking-factors
+deconvobuddies
cell-type-annotation
+assigning-cell-labels-from-reference-data
+getting-ready-again
+using-existing-references
+assigning-cell-labels-from-gene-sets
+references-4
introduction-to-spatial-transcriptomics
visium-spatial-technology
spatial-data-visualization
@@ -166,4 +177,4 @@ set-up-git-and-github
choose-a-template
edit-with-your-information
deploy-the-page
-references-4
+references-5
diff --git a/research-talks.html b/research-talks.html
index 9b331f1..e2ae2d2 100644
--- a/research-talks.html
+++ b/research-talks.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
diff --git a/scrna-seq-data-analysis-overview.html b/scrna-seq-data-analysis-overview.html
index 65cceb2..3c0fea8 100644
--- a/scrna-seq-data-analysis-overview.html
+++ b/scrna-seq-data-analysis-overview.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session
@@ -1209,69 +1226,74 @@ 11.8.2 Adjusting the parameters#> clust.50
#> 1 2 3 4 5
#> 362 812 945 288 409
+
+## Plot TSNE coloured by cluster assignments again, now with clust.50 results
+colLabels(sce.zeisel) <- clust.50
+plotReducedDim(sce.zeisel, "TSNE", colour_by = "label")
+
Edge weighting scheme
Further tweaking can be performed by changing the edge weighting scheme during graph construction. Setting type = "number"
will weight edges based on the number of nearest neighbors that are shared between two cells. Similarly, type = "jaccard"
will weight edges according to the Jaccard index of the two sets of neighbors. We can also disable weighting altogether by using a simple
k-nearest neighbor graph, which is occasionally useful for downstream graph operations that do not support weights.
-## Cluster using the number of shared nearest neighbors (type="number")
-clust.num <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(type = "number")
-)
-table(clust.num)
-#> clust.num
-#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
-#> 128 161 129 457 128 116 78 309 397 205 60 96 70 62 35 13 46 51 30 31 52 28 15 58 34 27
-
-## Cluster using the Jaccard index (similarity between sample sets)
-clust.jaccard <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(type = "jaccard")
-)
-table(clust.jaccard)
-#> clust.jaccard
-#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
-#> 131 166 195 129 294 128 113 77 332 200 375 61 97 71 84 32 13 46 53 30 52 28 31 36 15 27
+## Cluster using the number of shared nearest neighbors (type="number")
+clust.num <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(type = "number")
+)
+table(clust.num)
+#> clust.num
+#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+#> 128 161 129 457 128 116 78 309 397 205 60 96 70 62 35 13 46 51 30 31 52 28 15 58 34 27
-## Cluster without specifying a graph type (default method-KNNGraphParam)
-clust.none <- clusterCells(sce.zeisel,
+## Cluster using the Jaccard index (similarity between sample sets)
+clust.jaccard <- clusterCells(sce.zeisel,
use.dimred = "PCA",
- BLUSPARAM = KNNGraphParam()
+ BLUSPARAM = NNGraphParam(type = "jaccard")
)
-table(clust.none)
-#> clust.none
-#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
-#> 77 454 297 132 105 164 129 104 62 533 186 45 105 33 69 82 50 52 31 34 30 15 27
+table(clust.jaccard)
+#> clust.jaccard
+#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+#> 131 166 195 129 294 128 113 77 332 200 375 61 97 71 84 32 13 46 53 30 52 28 31 36 15 27
+
+## Cluster without specifying a graph type (default method-KNNGraphParam)
+clust.none <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = KNNGraphParam()
+)
+table(clust.none)
+#> clust.none
+#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
+#> 77 454 297 132 105 164 129 104 62 533 186 45 105 33 69 82 50 52 31 34 30 15 27
Community detection
The community detection can be performed by using any of the algorithms provided by igraph
. The Walktrap approach is a common one, but many others are available to choose from:
-clust.walktrap <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "walktrap")
-)
-
-clust.louvain <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "louvain")
-)
-
-clust.infomap <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "infomap")
-)
-
-clust.fast <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "fast_greedy")
-)
-
-clust.labprop <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "label_prop")
-)
-
-clust.eigen <- clusterCells(sce.zeisel,
- use.dimred = "PCA",
- BLUSPARAM = NNGraphParam(cluster.fun = "leading_eigen")
-)
+clust.walktrap <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "walktrap")
+)
+
+clust.louvain <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "louvain")
+)
+
+clust.infomap <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "infomap")
+)
+
+clust.fast <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "fast_greedy")
+)
+
+clust.labprop <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "label_prop")
+)
+
+clust.eigen <- clusterCells(sce.zeisel,
+ use.dimred = "PCA",
+ BLUSPARAM = NNGraphParam(cluster.fun = "leading_eigen")
+)
11.8.3 Hierarchical clustering
@@ -1280,91 +1302,91 @@ 11.8.3 Hierarchical clusteringIn practice, hierarchical clustering is too slow to be used for anything but the smallest scRNA-seq datasets. Most implementations require a cell-cell distance matrix that is prohibitively expensive to compute for a large number of cells. Greedy agglomeration is also likely to result in a quantitatively suboptimal partitioning (as defined by the agglomeration measure) at higher levels of the dendrogram when the number of cells and merge steps is high
We use a HclustParam object to instruct clusterCells()
to perform hierarchical clustering on the top PCs. Specifically, it computes a cell-cell distance matrix using the top PCs and then applies Ward’s minimum variance method to obtain a dendrogram.
For this case, we will use the sce.416b
-library("scran")
-## Top 2000 HVGs
-top.416b <- getTopHVGs(sce.416b, n = 2000)
-## Principal component analysis using top 2000 HVGs, 50 PCs
-set.seed(100)
-sce.416b <- fixedPCA(sce.416b, subset.row = top.416b)
-## TSNE
-sce.416b <- runTSNE(sce.416b, dimred = "PCA")
-library("dendextend")
-#>
-#> ---------------------
-#> Welcome to dendextend version 1.17.1
-#> Type citation('dendextend') for how to cite the package.
-#>
-#> Type browseVignettes(package = 'dendextend') for the package vignette.
-#> The github page is: https://github.com/talgalili/dendextend/
-#>
-#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
-#> You may ask questions at stackoverflow, use the r and dendextend tags:
-#> https://stackoverflow.com/questions/tagged/dendextend
-#>
-#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
-#> ---------------------
-#>
-#> Attaching package: 'dendextend'
-#> The following object is masked from 'package:stats':
-#>
-#> cutree
-
-## Perform hierarchical clustering on the PCA-reduced data from sce.416b
-## The BLUSPARAM argument specifies the clustering method (here "ward.D2").
-## The full=TRUE argument ensures that additional objects related to clustering are returned.
-hclust.416b <- clusterCells(sce.416b,
- use.dimred = "PCA",
- BLUSPARAM = HclustParam(method = "ward.D2"), full = TRUE
-)
-
-## Extract the hierarchical clustering tree from the clustering result
-tree.416b <- hclust.416b$objects$hclust
-
-## Customize the dendrogram for better visualization
-tree.416b$labels <- seq_along(tree.416b$labels)
-## Convert the hierarchical clustering tree to a dendrogram object
-dend <- as.dendrogram(tree.416b, hang = 0.1)
-combined.fac <- paste0(
- sce.416b$block, ".",
- sub(" .*", "", sce.416b$phenotype)
-)
-
-labels_colors(dend) <- c(
- "20160113.wild" = "blue",
- "20160113.induced" = "red",
- "20160325.wild" = "dodgerblue",
- "20160325.induced" = "salmon"
-)[combined.fac][order.dendrogram(dend)]
-
-## Plot the dendrogram
-plot(dend)
+library("scran")
+## Top 2000 HVGs
+top.416b <- getTopHVGs(sce.416b, n = 2000)
+## Principal component analysis using top 2000 HVGs, 50 PCs
+set.seed(100)
+sce.416b <- fixedPCA(sce.416b, subset.row = top.416b)
+## TSNE
+sce.416b <- runTSNE(sce.416b, dimred = "PCA")
+library("dendextend")
+#>
+#> ---------------------
+#> Welcome to dendextend version 1.17.1
+#> Type citation('dendextend') for how to cite the package.
+#>
+#> Type browseVignettes(package = 'dendextend') for the package vignette.
+#> The github page is: https://github.com/talgalili/dendextend/
+#>
+#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
+#> You may ask questions at stackoverflow, use the r and dendextend tags:
+#> https://stackoverflow.com/questions/tagged/dendextend
+#>
+#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
+#> ---------------------
+#>
+#> Attaching package: 'dendextend'
+#> The following object is masked from 'package:stats':
+#>
+#> cutree
+
+## Perform hierarchical clustering on the PCA-reduced data from sce.416b
+## The BLUSPARAM argument specifies the clustering method (here "ward.D2").
+## The full=TRUE argument ensures that additional objects related to clustering are returned.
+hclust.416b <- clusterCells(sce.416b,
+ use.dimred = "PCA",
+ BLUSPARAM = HclustParam(method = "ward.D2"), full = TRUE
+)
+
+## Extract the hierarchical clustering tree from the clustering result
+tree.416b <- hclust.416b$objects$hclust
+
+## Customize the dendrogram for better visualization
+tree.416b$labels <- seq_along(tree.416b$labels)
+## Convert the hierarchical clustering tree to a dendrogram object
+dend <- as.dendrogram(tree.416b, hang = 0.1)
+combined.fac <- paste0(
+ sce.416b$block, ".",
+ sub(" .*", "", sce.416b$phenotype)
+)
+
+labels_colors(dend) <- c(
+ "20160113.wild" = "blue",
+ "20160113.induced" = "red",
+ "20160325.wild" = "dodgerblue",
+ "20160325.induced" = "salmon"
+)[combined.fac][order.dendrogram(dend)]
+
+## Plot the dendrogram
+plot(dend)
To obtain explicit clusters, we “cut” the tree by removing internal branches such that every subtree represents a distinct cluster. This is most simply done by removing internal branches above a certain height of the tree, as performed by the cutree()
function. A more sophisticated variant of this approach is implemented in the dynamicTreeCut
package, which uses the shape of the branches to obtain a better partitioning for complex dendrograms. We enable this option by setting cut.dynamic = TRUE
, with additional tweaking of the deepSplit
parameter to control the resolution of the resulting clusters.
-library("dynamicTreeCut")
-
-## Perform hierarchical clustering with dynamic tree cut on the PCA
-## The BLUSPARAM argument specifies the clustering method (here "ward.D2"),
-## and enables dynamic tree cut (cut.dynamic=TRUE) with specific parameters.
-hclust.dyn <- clusterCells(sce.416b,
- use.dimred = "PCA",
- BLUSPARAM = HclustParam(
- method = "ward.D2", cut.dynamic = TRUE,
- cut.params = list(minClusterSize = 10, deepSplit = 1)
- )
-)
-table(hclust.dyn)
-#> hclust.dyn
-#> 1 2 3 4
-#> 82 70 27 13
-
-## Plot dendogram
-labels_colors(dend) <- as.integer(hclust.dyn)[order.dendrogram(dend)]
-plot(dend)
-
+library("dynamicTreeCut")
+
+## Perform hierarchical clustering with dynamic tree cut on the PCA
+## The BLUSPARAM argument specifies the clustering method (here "ward.D2"),
+## and enables dynamic tree cut (cut.dynamic=TRUE) with specific parameters.
+hclust.dyn <- clusterCells(sce.416b,
+ use.dimred = "PCA",
+ BLUSPARAM = HclustParam(
+ method = "ward.D2", cut.dynamic = TRUE,
+ cut.params = list(minClusterSize = 10, deepSplit = 1)
+ )
+)
+table(hclust.dyn)
+#> hclust.dyn
+#> 1 2 3 4
+#> 82 70 27 13
-## Obtain assignations and plot TSNE
-colLabels(sce.416b) <- factor(hclust.dyn)
-plotReducedDim(sce.416b, "TSNE", colour_by = "label")
+## Plot dendrogram
+labels_colors(dend) <- as.integer(hclust.dyn)[order.dendrogram(dend)]
+plot(dend)
+
+
+## Obtain assignations and plot TSNE
+colLabels(sce.416b) <- factor(hclust.dyn)
+plotReducedDim(sce.416b, "TSNE", colour_by = "label")
@@ -1374,9 +1396,783 @@ 11.8.4 Subclustering
11.9 Marker gene detection
+To interpret our clustering results, we need to identify the genes that drive separation between clusters. These marker genes allow us to assign biological meaning to each cluster based on their functional annotation. In the simplest case, we have a priori knowledge of the marker genes associated with particular cell types, allowing us to treat the clustering as a proxy for cell type identity.
+The same principle can be applied to discover more subtle differences between clusters (e.g., changes in activation or differentiation state) based on the behavior of genes in the affected pathways.
+The most straightforward approach to marker gene detection involves testing for differential expression between clusters. If a gene is strongly DE between clusters, it is likely to have driven the separation of cells in the clustering algorithm.
+Several methods are available to quantify the differences in expression profiles between clusters and obtain a single ranking of genes for each cluster.
+
+11.9.1 Scoring markers by pairwise comparisons
+Our general strategy is to compare each pair of clusters and compute scores quantifying the differences in the expression distributions between clusters. The scores for all pairwise comparisons involving a particular cluster are then consolidated into a single DataFrame
for that cluster.
+The scoreMarkers()
function from scran
returns a list of DataFrames, where each DataFrame corresponds to a cluster and each row of the DataFrame corresponds to a gene.
+In the DataFrame for cluster “X”, the columns contain
+
+- the “self.average”: the mean log-expression in “X”
+- “other.average”: the grand mean across all other clusters
+- self.detected: the proportion of cells with detected expression in “X”
+- other.detected: the mean detected proportion across all other clusters
+- a variety of effect size summaries generated from all pairwise comparisons involving “X”
+
+library("scran")
+
+## Scoring markers by pairwise comparisons
+marker.info <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel))
+marker.info
+#> List of length 5
+#> names(5): 1 2 3 4 5
+
+## Statistics for cluster 1
+colnames(marker.info[["1"]])
+#> [1] "self.average" "other.average" "self.detected" "other.detected"
+#> [5] "mean.logFC.cohen" "min.logFC.cohen" "median.logFC.cohen" "max.logFC.cohen"
+#> [9] "rank.logFC.cohen" "mean.AUC" "min.AUC" "median.AUC"
+#> [13] "max.AUC" "rank.AUC" "mean.logFC.detected" "min.logFC.detected"
+#> [17] "median.logFC.detected" "max.logFC.detected" "rank.logFC.detected"
+For each cluster, we can then rank candidate markers based on one of these effect size summaries
+## Subset to the first cluster
+chosen <- marker.info[["1"]]
+
+## Rank candidate markers based on one of these effect size summaries
+ordered <- chosen[order(chosen$mean.AUC, decreasing=TRUE),]
+head(ordered[,1:4])
+#> DataFrame with 6 rows and 4 columns
+#> self.average other.average self.detected other.detected
+#> <numeric> <numeric> <numeric> <numeric>
+#> Cst3 6.16324 2.721622 0.977901 0.895103
+#> Sepp1 3.88488 0.830472 0.900552 0.297665
+#> B2m 3.09863 0.820891 0.820442 0.468411
+#> Sparcl1 5.54095 2.882289 0.930939 0.845173
+#> Zfp36l1 2.48297 0.261565 0.723757 0.151524
+#> Gng5 2.40613 0.603069 0.812155 0.320553
+library("scater")
+
+## Plot the marker gene expression by label
+plotExpression(sce.zeisel, features=head(rownames(ordered)),
+ x="label", colour_by="label")
+
+# Distribution of expression values across clusters for the top potential
+# marker genes (as determined by the mean AUC) for cluster 1
+Here, we deliberately use pairwise comparisons rather than comparing each cluster to the average of all other cells. The latter approach is sensitive to the population composition, which introduces an element of unpredictability to the marker sets due to variation in cell type abundances.
+In the worst case, the presence of one sub-population containing a majority of the cells will drive the selection of top markers for every other cluster, pushing out useful genes that can distinguish between the smaller sub-populations.
+
+
+11.9.2 Effect sizes for pairwise comparisons
+The AUC or Cohen’s d is usually the best choice for general purpose marker detection, as they are effective regardless of the magnitude of the expression values. The log-fold change in the detected proportion is specifically useful for identifying binary changes in expression.
+AUC
+In the context of marker detection, the area under the curve (AUC) quantifies our ability to distinguish between two distributions in a pairwise comparison. The AUC represents the probability that a randomly chosen observation from our cluster of interest is greater than a randomly chosen observation from the other cluster.
+
+- A value of 1 corresponds to upregulation, where all values of our cluster of interest are greater than any value from the other cluster
+- A value of 0.5 means that there is no net difference in the location of the distributions
+- A value of 0 corresponds to downregulation
+
+The AUC is closely related to the U-statistic in the Wilcoxon ranked sum test (a.k.a., Mann-Whitney U-test).
+## Subset the AUC from the candidate markers of cluster 1 info
+## and rank (by AUC)
+auc.only <- chosen[,grepl("AUC", colnames(chosen))]
+auc.only[order(auc.only$mean.AUC,decreasing=TRUE),]
+#> DataFrame with 19839 rows and 5 columns
+#> mean.AUC min.AUC median.AUC max.AUC rank.AUC
+#> <numeric> <numeric> <numeric> <numeric> <integer>
+#> Cst3 0.895187 0.858550 0.900951 0.920298 1
+#> Sepp1 0.873234 0.723951 0.918144 0.932695 1
+#> B2m 0.830457 0.794594 0.826349 0.874535 3
+#> Sparcl1 0.829280 0.799851 0.816875 0.883517 2
+#> Zfp36l1 0.828530 0.810392 0.827840 0.848047 3
+#> ... ... ... ... ... ...
+#> Hsp90aa1 0.1054120 0.0623657 0.0905663 0.1781496 19655
+#> Scg5 0.1033219 0.0193178 0.0312032 0.3315632 19249
+#> Snurf 0.0876293 0.0140424 0.0358926 0.2646899 19529
+#> [ reached getOption("max.print") -- omitted 2 rows ]
+Cohen’s d
+Cohen’s d is a standardized log-fold change where the difference in the mean log-expression between groups is scaled by the average standard deviation across groups. In other words, it is the number of standard deviations that separate the means of the two groups.
+The interpretation is similar to the log-fold change:
+
+- Positive values indicate that the gene is upregulated in our cluster of interest
+- Negative values indicate downregulation
+- values close to zero indicate that there is little difference.
+
+Cohen’s d is roughly analogous to the t-statistic in various two-sample t-tests.
+## Subset the "logFC.cohen" from the candidate markers of cluster 1 info
+## and rank (by Cohen’s d)
+cohen.only <- chosen[,grepl("logFC.cohen", colnames(chosen))]
+cohen.only[order(cohen.only$mean.logFC.cohen,decreasing=TRUE),]
+#> DataFrame with 19839 rows and 5 columns
+#> mean.logFC.cohen min.logFC.cohen median.logFC.cohen max.logFC.cohen rank.logFC.cohen
+#> <numeric> <numeric> <numeric> <numeric> <integer>
+#> Sepp1 2.00527 0.887739 2.29849 2.53635 1
+#> Cst3 1.77874 1.501392 1.82252 1.96852 1
+#> Gng5 1.66122 0.538738 1.94492 2.21630 2
+#> Zfp36l1 1.63278 1.459939 1.63000 1.81116 3
+#> Apoe 1.58840 1.309029 1.61529 1.81402 5
+#> ... ... ... ... ... ...
+#> Rab3a -2.80064 -3.94223 -3.35880 -0.542752 18990
+#> Mllt11 -2.83899 -4.53504 -3.18120 -0.458515 18737
+#> Acot7 -2.97134 -3.41241 -2.85837 -2.756219 19766
+#> [ reached getOption("max.print") -- omitted 2 rows ]
+log-fold change
+Finally, we also compute the log-fold change in the proportion of cells with detected expression between clusters. This ignores any information about the magnitude of expression, only considering whether any expression is detected at all. Again, positive values indicate that a greater proportion of cells express the gene in our cluster of interest compared to the other cluster.
+Note that a pseudo-count is added to avoid undefined log-fold changes when no cells express the gene in either group.
+## Subset the "logFC.detected" from the candidate markers of cluster 1 info
+## and rank (by log-fold change)
+detect.only <- chosen[,grepl("logFC.detected", colnames(chosen))]
+detect.only[order(detect.only$mean.logFC.detected,decreasing=TRUE),]
+#> DataFrame with 19839 rows and 5 columns
+#> mean.logFC.detected min.logFC.detected median.logFC.detected max.logFC.detected rank.logFC.detected
+#> <numeric> <numeric> <numeric> <numeric> <integer>
+#> Hhex 3.88701 3.18636 3.73608 4.88952 2
+#> Ly6f 3.80886 2.62872 3.98527 4.63618 3
+#> 9030619P08Rik 3.74819 2.62200 3.76970 4.83137 1
+#> Casp8 3.69939 2.49673 3.34698 5.60688 3
+#> Kcne1l 3.60801 2.17477 3.75120 4.75489 1
+#> ... ... ... ... ... ...
+#> Slc35f4 -4.98323 -5.85599 -5.40401 -3.26891 19695
+#> 2900079G21Rik -5.03133 -5.99641 -5.44274 -3.24343 19688
+#> D630023F18Rik -5.16180 -6.71692 -5.63179 -2.66670 19516
+#> [ reached getOption("max.print") -- omitted 2 rows ]
+The AUC or Cohen’s d is usually the best choice for general purpose marker detection, as they are effective regardless of the magnitude of the expression values. The log-fold change in the detected proportion is specifically useful for identifying binary changes in expression.
+
+
+11.9.3 Summaries of pairwise effects
+In a dataset with “N” clusters, each cluster is associated with “N” − 1 values for each type of effect size described. To simplify interpretation, we summarize the effects for each cluster into some key statistics such as the mean and median.
+Each summary statistic has a different interpretation when used for ranking:
+
+mean: The most obvious summary statistic is the mean. For cluster “X”, a large mean effect size (>0 for the log-fold changes, >0.5 for the AUCs) indicates that the gene is upregulated in “X” compared to the average of the other groups.
+median: a large value indicates that the gene is upregulated in “X” compared to most (>50%) other clusters. The median provides greater robustness to outliers than the mean, which may or may not be desirable. On one hand, the median avoids an inflated effect size if only a minority of comparisons have large effects; on the other hand, it will also overstate the effect size by ignoring a minority of comparisons that have opposing effects.
+minimum value: The minimum value (min.*) is the most stringent summary for identifying upregulated genes, as a large value indicates that the gene is upregulated in “X” compared to all other clusters. Conversely, if the minimum is small (<0 for the log-fold changes, <0.5 for the AUCs), we can conclude that the gene is downregulated in “X” compared to at least one other cluster.
+maximum value: The maximum value (max.*) is the least stringent summary for identifying upregulated genes, as a large value can be obtained if there is strong upregulation in “X” compared to any other cluster. Conversely, if the maximum is small, we can conclude that the gene is downregulated in “X” compared to all other clusters.
+minimum rank: The minimum rank, a.k.a., “min-rank” (rank.*) is the smallest rank of each gene across all pairwise comparisons. Specifically, genes are ranked within each pairwise comparison based on decreasing effect size, and then the smallest rank across all comparisons is reported for each gene. If a gene has a small min-rank, we can conclude that it is one of the top upregulated genes in at least one comparison of “X” to another cluster.
+
+Each of these summaries is computed for each effect size, for each gene, and for each cluster.
+Our next step is to choose one of these summary statistics for one of the effect sizes and to use it to rank the rows of the DataFrame. For identifying upregulated genes, ranking by the minimum is the most stringent and the maximum is the least stringent; the mean and median fall somewhere in between and are reasonable defaults for most applications.
+
+
+11.9.4 Choose top markers
+To continue our example, we will use the median Cohen’s d to obtain a ranking of the marker genes.
+Now that we have them ranked, we can choose how many of them are interesting to us. For this example, we will keep only the set of markers in which the Cohen’s d derived min-ranks are less than or equal to 10.
+## Order the candidate markers by "rank.logFC.cohen" for each cluster
+ordered <- chosen[order(chosen$rank.logFC.cohen),]
+
+## Choose the top marker genes for each cluster
+top.ranked <- ordered[ordered$rank.logFC.cohen <= 10,]
+rownames(top.ranked) # Gene names
+#> [1] "Cst3" "Sepp1" "Gng5" "Sparcl1" "B2m" "Zfp36l1" "Atp1a2" "Qk" "Apoe" "Id3"
+#> [11] "Sat1" "Sparc" "Cd63" "Epas1" "Slco1c1" "Glul" "Gstm1" "Mt1" "Serpine2"
+We can also plot the expression in a Heat Map:
+## Plot a heatmap for the expression of some top marker genes for each cluster
+plotGroupedHeatmap(sce.zeisel, features=rownames(top.ranked), group="label",
+ center=TRUE, zlim=c(-3, 3))
+
+### Using a log-fold change threshold
+The Cohen’s d and AUC calculations consider both the magnitude of the difference between clusters as well as the variability within each cluster.
+If the variability is lower, it is possible for a gene to have a large effect size even if the magnitude of the difference is small.
+These genes tend to be somewhat uninformative for cell type identification despite their strong differential expression (e.g., ribosomal protein genes). We would prefer genes with larger log-fold changes between clusters, even if they have higher variability.
+To favor the detection of such genes, we can compute the effect sizes relative to a log-fold change threshold by setting lfc= in scoreMarkers().
+## Scoring markers by pairwise comparisons (effect sizes relative to a log-fold change)
+marker.info.lfc <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel), lfc=2)
+
+## Statistics for cluster 1
+chosen2 <- marker.info.lfc[["1"]]
+## Rank info from cluster 1 by mean.AUC
+chosen2 <- chosen2[order(chosen2$mean.AUC, decreasing=TRUE),]
+chosen2[,c("self.average", "other.average", "mean.AUC")] # Check "self.average", "other.average", "mean.AUC"
+#> DataFrame with 19839 rows and 3 columns
+#> self.average other.average mean.AUC
+#> <numeric> <numeric> <numeric>
+#> Sepp1 3.88488 0.830472 0.716022
+#> Cst3 6.16324 2.721622 0.706665
+#> Atp1a2 4.54695 0.995355 0.680076
+#> Apoe 4.44418 0.619766 0.667839
+#> Sparc 3.71278 0.625208 0.642240
+#> ... ... ... ...
+#> Zscan4b 0.00000000 0.000835435 0
+#> Zscan4e 0.00000000 0.000977365 0
+#> Zscan4f 0.00000000 0.001098376 0
+#> Zswim5 0.01622302 0.059746058 0
+#> Zyg11a 0.00451334 0.001709500 0
+We can also create something a little bit different. Here we have a dot plot of the top potential marker genes (as determined by the mean AUC) for cluster 1.
+
+- Each row corresponds to a marker gene and each column corresponds to a cluster.
+- The size of each dot represents the proportion of cells with detected expression of the gene in the cluster.
+- The color is proportional to the average expression across all cells in that cluster.
+
+
+
+
+
+11.9.5 Handling blocking factors
+Some studies may contain factors of variation that are known and not interesting (e.g., batch effects, sex differences). If these are not modelled, they can interfere with marker gene detection, most obviously by inflating the variance within each cluster, but also by distorting the log-fold changes if the cluster composition varies across levels of the blocking factor.
+To avoid these issues, we specify the blocking factor via the block= argument
+## Scoring markers by pairwise comparisons using a block factor (tissue)
+m.out <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel), block=sce.zeisel$tissue)
+For each gene, each pairwise comparison between clusters is performed separately in each level of the blocking factor - in this case, the plate of origin. By comparing within each batch, we cancel out any batch effects so that they are not conflated with the biological differences between subpopulations. The effect sizes are then averaged across batches to obtain a single value per comparison, using a weighted mean that accounts for the number of cells involved in the comparison in each batch.
+A similar correction is applied to the mean log-expression and proportion of detected cells inside and outside each cluster.
+## Subset the info for cluster 1
+demo <- m.out[["1"]]
+## Order by the median Cohen’s d (which had a correction using block=tissue)
+ordered <- demo[order(demo$median.logFC.cohen, decreasing=TRUE),]
+ordered[,1:4]
+#> DataFrame with 19839 rows and 4 columns
+#> self.average other.average self.detected other.detected
+#> <numeric> <numeric> <numeric> <numeric>
+#> Sepp1 3.50217 0.528293 0.825712 0.209078
+#> Gng5 2.37150 0.575725 0.800459 0.311615
+#> Cst3 6.45770 2.954119 0.981201 0.904447
+#> Apoe 4.66651 0.795313 0.704185 0.186789
+#> Zfp36l1 2.49179 0.268529 0.735299 0.156745
+#> ... ... ... ... ...
+#> Syp 0.238229 2.20762 0.148603 0.814644
+#> Rab3a 0.475216 2.62944 0.219682 0.878967
+#> Snap25 1.229693 4.40115 0.433556 0.882369
+#> Stmn3 0.614814 3.75383 0.204414 0.840670
+#> Ndrg4 0.664605 3.23414 0.240788 0.825395
+We can also plot our top marker genes expression now coloured by the block factor we used, in this case “tissue”.
+## In case we don't have them as factors for the coloring
+sce.zeisel$tissue <- as.factor(sce.zeisel$tissue)
+## Plot the top marker genes expression by tissue
+plotExpression(sce.zeisel, features=rownames(ordered)[1:6],
+ x="label", colour_by="tissue")
+
+The block= argument works for all effect sizes shown above and is robust to differences in the log-fold changes or variance between batches.
+However, it assumes that each pair of clusters is present in at least one batch. In scenarios where cells from two clusters never co-occur in the same batch, the associated pairwise comparison will be impossible and is ignored during calculation of summary statistics.
+
+
+11.9.6 Deconvobuddies
+if (!requireNamespace("BiocManager", quietly = TRUE)) {
+ install.packages("BiocManager")
+}
+
+BiocManager::install("DeconvoBuddies")
+findMarkers_1vAll
+Calculate 1 vs. All standard fold change for each gene x cell type, wrapper function for scran::findMarkers
+https://research.libd.org/DeconvoBuddies/reference/findMarkers_1vAll.html
+get_mean_ratio2
+Calculate the mean ratio value and rank for each gene for each cell type in the sce object, to identify effective marker genes
+https://research.libd.org/DeconvoBuddies/reference/get_mean_ratio2.html
+
11.10 Cell type annotation
+The most challenging task in scRNA-seq data analysis is arguably the interpretation of the results. Obtaining clusters of cells is fairly straightforward, but it is more difficult to determine what biological state is represented by each of those clusters. Doing so requires us to bridge the gap between the current dataset and prior biological knowledge, and the latter is not always available in a consistent and quantitative manner.
+Even the concept of a “cell type” is not clearly defined :(
+Interpretation of scRNA-seq data is often manual and a common bottleneck in the analysis workflow.
+To expedite this step, we can try to use various computational approaches that exploit prior information to assign meaning to an uncharacterized scRNA-seq dataset.
+The most obvious sources of prior information are the curated gene sets associated with particular biological processes, e.g., from the Gene Ontology (GO) or the Kyoto Encyclopedia of Genes and Genomes (KEGG) collections. Alternatively, we could directly compare our expression profiles to published reference datasets where each sample or cell has already been annotated with its putative biological state by domain experts.
+It is important to have in mind that this step will largely depend on the amount of previous biological knowledge for your specific data set.
+The most obvious sources of prior information are the curated gene sets associated with particular biological processes, e.g., from the Gene Ontology (GO) or the Kyoto Encyclopedia of Genes and Genomes (KEGG) collections.
+Alternatively, we can directly compare our expression profiles to published reference datasets where each sample or cell has already been annotated with its putative biological state by domain experts.
+
+11.10.1 Assigning cell labels from reference data
+A conceptually straightforward annotation approach is to compare the single-cell expression profiles with previously annotated reference datasets. Labels can then be assigned to each cell in our uncharacterized test dataset based on the most similar reference sample(s). This is a standard classification challenge that can be tackled by standard machine learning techniques such as random forests and support vector machines.
+Any published and labelled RNA-seq dataset (bulk or single-cell) can be used as a reference
+SingleR method
+The SingleR method (Aran et al. 2019) for cell type annotation assigns labels to cells based on the reference samples with the highest Spearman rank correlations, using only the marker genes between pairs of labels to focus on the relevant differences between cell types.
+It also performs a fine-tuning step for each cell where the correlations are recomputed with just the marker genes for the top-scoring labels. This aims to resolve any ambiguity between those labels by removing noise from irrelevant markers for other labels. Further details can be found in the SingleR book.
+
+
+
+11.11 Getting ready again
+We will now use one of the 10X PBMC datasets as our test. We will apply quality control, normalization and clustering for this dataset, although this is not strictly necessary.
+It is entirely possible to run SingleR()
 on the raw counts without any a priori quality control and filter on the annotation results at one's leisure.
+## Load data
+library("DropletTestFiles")
+raw.path <- getTestFile("tenx-2.1.0-pbmc4k/1.0.0/raw.tar.gz")
+#> see ?DropletTestFiles and browseVignettes('DropletTestFiles') for documentation
+#> downloading 1 resources
+#> retrieving 1 resource
+#> loading from cache
+out.path <- file.path(tempdir(), "pbmc4k")
+untar(raw.path, exdir=out.path)
+
+library("DropletUtils")
+fname <- file.path(out.path, "raw_gene_bc_matrices/GRCh38")
+sce.pbmc <- read10xCounts(fname, col.names=TRUE)
+
+library("scater")
+rownames(sce.pbmc) <- uniquifyFeatureNames(
+ rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)
+
+library("EnsDb.Hsapiens.v86")
+location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID,
+ column="SEQNAME", keytype="GENEID")
+#> Warning: Unable to map 144 of 33694 requested IDs.
+
+### QC
+set.seed(100)
+e.out <- emptyDrops(counts(sce.pbmc))
+sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
+unfiltered <- sce.pbmc
+stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT")))
+high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
+sce.pbmc <- sce.pbmc[,!high.mito]
+summary(high.mito)
+#> Mode FALSE TRUE
+#> logical 3985 315
+
+### Normalization
+library("scran")
+set.seed(1000)
+clusters <- quickCluster(sce.pbmc)
+sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
+sce.pbmc <- logNormCounts(sce.pbmc)
+summary(sizeFactors(sce.pbmc))
+#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+#> 0.00749 0.71207 0.87490 1.00000 1.09900 12.25412
+
+### Variance modelling
+set.seed(1001)
+dec.pbmc <- modelGeneVarByPoisson(sce.pbmc)
+top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
+
+### Dimensionality reduction
+set.seed(10000)
+sce.pbmc <- denoisePCA(sce.pbmc, subset.row=top.pbmc, technical=dec.pbmc)
+set.seed(100000)
+sce.pbmc <- runTSNE(sce.pbmc, dimred="PCA")
+set.seed(1000000)
+sce.pbmc <- runUMAP(sce.pbmc, dimred="PCA")
+
+### Clustering
+g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
+clust <- igraph::cluster_walktrap(g)$membership
+colLabels(sce.pbmc) <- factor(clust)
+table(colLabels(sce.pbmc))
+#>
+#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+#> 205 731 617 56 541 352 125 46 819 47 153 61 129 87 16
+
+
+
+### Interpretation
+markers <- findMarkers(sce.pbmc, pval.type="some", direction="up")
+marker.set <- markers[["8"]]
+as.data.frame(marker.set[1:30,1:3])
+#> p.value FDR summary.logFC
+#> PF4 5.234138e-32 1.763591e-27 6.862880
+#> TMSB4X 3.502960e-25 5.901437e-21 3.129070
+#> TAGLN2 2.055571e-24 2.308680e-20 4.771441
+#> NRGN 1.005824e-22 8.472562e-19 5.007984
+#> SDPR 2.288275e-22 1.542023e-18 5.610425
+#> PPBP 7.961199e-20 4.470744e-16 6.500820
+#> GPX1 1.177137e-19 5.666066e-16 5.158546
+#> CCL5 5.637712e-19 2.374463e-15 5.316314
+#> GNG11 8.384893e-19 3.139118e-15 5.475652
+#> HIST1H2AC 2.660666e-18 8.964847e-15 5.532573
+#> TUBB1 7.919842e-18 2.425920e-14 4.987507
+#> ACTB 4.073237e-17 1.058163e-13 3.171552
+#> B2M 4.082661e-17 1.058163e-13 1.610689
+#> FTH1 2.973798e-14 7.157083e-11 3.425641
+#> RGS18 6.579466e-13 1.477924e-09 4.298459
+#> ACRBP 1.357416e-12 2.858549e-09 3.969306
+#> [ reached 'max' / getOption("max.print") -- omitted 14 rows ]
+plotExpression(sce.pbmc, features=c("CD14", "CD68",
+ "MNDA", "FCGR3A"), x="label", colour_by="label")
+
+
+11.11.1 Using existing references
+The celldex
 package contains a number of curated reference datasets, mostly assembled from bulk RNA-seq or microarray data of sorted cell types. These references are often good enough for most applications provided that they contain the cell types that are expected in the test population. Here, we will use a reference constructed from Blueprint and ENCODE data (Martens and Stunnenberg 2013; The ENCODE Project Consortium 2012); this is obtained by calling the BlueprintEncodeData()
function to construct a SummarizedExperiment containing log-expression values with curated labels for each sample.
+library("celldex")
+#>
+#> Attaching package: 'celldex'
+#> The following objects are masked from 'package:scRNAseq':
+#>
+#> fetchLatestVersion, fetchMetadata, listVersions
+
+ref <- BlueprintEncodeData()
+ref
+#> class: SummarizedExperiment
+#> dim: 19859 259
+#> metadata(0):
+#> assays(1): logcounts
+#> rownames(19859): TSPAN6 TNMD ... LINC00550 GIMAP1-GIMAP5
+#> rowData names(0):
+#> colnames(259): mature.neutrophil CD14.positive..CD16.negative.classical.monocyte ...
+#> epithelial.cell.of.umbilical.artery.1 dermis.lymphatic.vessel.endothelial.cell.1
+#> colData names(3): label.main label.fine label.ont
+We call the SingleR()
function to annotate each of our PBMCs with the main cell type labels from the Blueprint/ENCODE reference. This returns a DataFrame where each row corresponds to a cell in the test dataset and contains its label assignments. Alternatively, we could use the labels in ref$label.fine, which provide more resolution at the cost of speed and increased ambiguity in the assignments.
+library("SingleR")
+#>
+#> Attaching package: 'SingleR'
+#> The following objects are masked from 'package:celldex':
+#>
+#> BlueprintEncodeData, DatabaseImmuneCellExpressionData, HumanPrimaryCellAtlasData, ImmGenData,
+#> MonacoImmuneData, MouseRNAseqData, NovershternHematopoieticData
+
+pred <- SingleR(test=sce.pbmc, ref=ref, labels=ref$label.main)
+table(pred$labels)
+#>
+#> B-cells CD4+ T-cells CD8+ T-cells DC Eosinophils Erythrocytes HSC Monocytes NK cells
+#> 549 772 1275 1 1 6 14 1116 251
+Now, we can inspect the results using a heatmap of the per-cell and label scores. Ideally, each cell should exhibit a high score in one label relative to all of the others, indicating that the assignment to that label was unambiguous.
+In this particular case it is true for monocytes and B cells, whereas we see more ambiguity between CD4+ and CD8+ T cells (and to a lesser extent, NK cells).
+
+
+We now compare the assignments with the clustering results to determine the identity of each cluster using a Heatmap of the distribution of cells across labels and clusters in the 10X PBMC dataset.
+Here, several clusters are nested within the monocyte and B cell labels, indicating that the clustering represents finer subdivisions within the cell types.
+Interestingly, our clustering does not effectively distinguish between CD4+ and CD8+ T cell labels. This is probably due to the presence of other factors of heterogeneity within the T cell subpopulation (e.g., activation) that have a stronger influence on unsupervised methods than the a priori expected CD4+/CD8+ distinction.
+tab <- table(Assigned=pred$pruned.labels, Cluster=colLabels(sce.pbmc))
+
+# Adding a pseudo-count of 10 to avoid strong color jumps with just 1 cell.
+library(pheatmap)
+pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101))
+
+This highlights some of the differences between reference-based annotation and unsupervised clustering. The former explicitly focuses on aspects of the data that are known to be interesting, simplifying the process of biological interpretation. However, the cost is that the downstream analysis is restricted by the diversity and resolution of the available labels, a problem that is largely avoided by de novo identification of clusters.
+Applying both strategies to examine the agreement (or lack thereof) between reference label and cluster assignments could work for you. Any inconsistencies are not necessarily problematic due to the conceptual differences between the two approaches; indeed, one could use those discrepancies as the basis for further investigation to discover novel factors of variation in the data.
+We can also apply SingleR to single-cell reference datasets that are curated and supplied by the user. This is most obviously useful when we have an existing dataset that was previously (manually) annotated and we want to use that knowledge to annotate a new dataset in an automated manner.
+
+
+11.11.2 Assigning cell labels from gene sets
+A related strategy is to explicitly identify sets of marker genes that are highly expressed in each individual cell. This does not require matching of individual cells to the expression values of the reference dataset, which is faster and more convenient when only the identities of the markers are available.
+For this example, we will be using the neuronal cell type markers derived from the Zeisel et al. (2015) study.
+library("scran")
+
+wilcox.z <- pairwiseWilcox(sce.zeisel, sce.zeisel$level1class,
+ lfc=1, direction="up")
+
+markers.z <- getTopMarkers(wilcox.z$statistics, wilcox.z$pairs,
+ pairwise=FALSE, n=50)
+
+lengths(markers.z)
+#> astrocytes_ependymal endothelial-mural interneurons microglia oligodendrocytes
+#> 78 85 120 69 82
+#> pyramidal CA1 pyramidal SS
+#> 122 148
+And our test dataset will be another brain scRNA-seq experiment from Tasic et al. (2016).
+library("scRNAseq")
+
+sce.tasic <- TasicBrainData()
+sce.tasic
+#> class: SingleCellExperiment
+#> dim: 24058 1809
+#> metadata(0):
+#> assays(1): counts
+#> rownames(24058): 0610005C13Rik 0610007C21Rik ... mt_X57780 tdTomato
+#> rowData names(0):
+#> colnames(1809): Calb2_tdTpositive_cell_1 Calb2_tdTpositive_cell_2 ... Rbp4_CTX_250ng_2 Trib2_CTX_250ng_1
+#> colData names(12): mouse_line cre_driver_1 ... secondary_type aibs_vignette_id
+#> reducedDimNames(0):
+#> mainExpName: NULL
+#> altExpNames(1): ERCC
+We are using the AUCell
package to identify marker sets that are highly expressed in each cell. This method ranks genes by their expression values within each cell and constructs a response curve of the number of genes from each marker set that are present with increasing rank.
+It then computes the area under the curve (AUC) for each marker set, quantifying the enrichment of those markers among the most highly expressed genes in that cell. This is roughly similar to performing a Wilcoxon rank sum test between genes in and outside of the set, but involving only the top ranking genes by expression in each cell.
+library("GSEABase")
+#> Loading required package: annotate
+#> Loading required package: XML
+#>
+#> Attaching package: 'XML'
+#> The following object is masked from 'package:patchwork':
+#>
+#> free
+#> Loading required package: graph
+#>
+#> Attaching package: 'graph'
+#> The following object is masked from 'package:XML':
+#>
+#> addNode
+#> The following object is masked from 'package:circlize':
+#>
+#> degree
+#> The following object is masked from 'package:stringr':
+#>
+#> boundary
+library("AUCell")
+
+all.sets <- lapply(names(markers.z), function(x) {
+ GeneSet(markers.z[[x]], setName=x)
+})
+all.sets <- GeneSetCollection(all.sets)
+
+rankings <- AUCell_buildRankings(counts(sce.tasic),
+ plotStats=FALSE, verbose=FALSE)
+
+cell.aucs <- AUCell_calcAUC(all.sets, rankings)
+#> Genes in the gene sets NOT available in the dataset:
+#> endothelial-mural: 8 (9% of 85)
+#> interneurons: 1 (1% of 120)
+#> oligodendrocytes: 2 (2% of 82)
+#> pyramidal CA1: 4 (3% of 122)
+#> pyramidal SS: 4 (3% of 148)
+
+results <- t(assay(cell.aucs))
+head(results)
+#> gene sets
+#> cells astrocytes_ependymal endothelial-mural interneurons microglia oligodendrocytes
+#> Calb2_tdTpositive_cell_1 0.1285798 0.04210738 0.5480712 0.04845394 0.1291290
+#> Calb2_tdTpositive_cell_2 0.1261887 0.04823270 0.4615967 0.02682648 0.1083978
+#> Calb2_tdTpositive_cell_3 0.1030379 0.07177445 0.3679172 0.03582241 0.1345914
+#> Calb2_tdTpositive_cell_4 0.1220786 0.04930379 0.5336098 0.05387632 0.1250108
+#> Calb2_tdTpositive_cell_5 0.1531630 0.06033829 0.5062161 0.06655747 0.1151828
+#> Calb2_tdTpositive_cell_6 0.1237204 0.09046280 0.3618004 0.03201310 0.1293656
+#> gene sets
+#> cells pyramidal CA1 pyramidal SS
+#> Calb2_tdTpositive_cell_1 0.2306182 0.3381124
+#> Calb2_tdTpositive_cell_2 0.2033824 0.2716657
+#> Calb2_tdTpositive_cell_3 0.3217893 0.5137783
+#> Calb2_tdTpositive_cell_4 0.2569572 0.3441631
+#> Calb2_tdTpositive_cell_5 0.2109269 0.3030309
+#> Calb2_tdTpositive_cell_6 0.4041339 0.5251548
+We assign cell type identity to each cell in the test dataset by taking the marker set with the top AUC as the label for that cell. Our new labels mostly agree with the original annotation from Tasic et al. (2016), which is encouraging :)
+The only exception involves misassignment of oligodendrocyte precursors to astrocytes, which may be understandable given that they are derived from a common lineage. In the absence of prior annotation, a more general diagnostic check is to compare the assigned labels to cluster identities, under the expectation that most cells of a single cluster would have the same label (or, if multiple labels are present, they should at least represent closely related cell states).
+new.labels <- colnames(results)[max.col(results)]
+tab <- table(new.labels, sce.tasic$broad_type)
+tab
+#>
+#> new.labels Astrocyte Endothelial Cell GABA-ergic Neuron Glutamatergic Neuron Microglia Oligodendrocyte
+#> astrocytes_ependymal 43 2 0 0 0 0
+#> endothelial-mural 0 27 0 0 0 0
+#> interneurons 0 0 760 3 0 0
+#> microglia 0 0 0 0 22 0
+#> oligodendrocytes 0 0 1 0 0 38
+#> pyramidal SS 0 0 0 809 0 0
+#>
+#> new.labels Oligodendrocyte Precursor Cell Unclassified
+#> astrocytes_ependymal 21 4
+#> endothelial-mural 0 2
+#> interneurons 0 15
+#> microglia 0 1
+#> oligodendrocytes 1 0
+#> pyramidal SS 0 60
+As a diagnostic measure, we examine the distribution of AUCs across cells for each label.
+In heterogeneous populations, the distribution for each label should be bimodal with one high-scoring peak containing cells of that cell type and a low-scoring peak containing cells of other types.
+The gap between these two peaks can be used to derive a threshold for whether a label is “active” for a particular cell. (In this case, we simply take the single highest-scoring label per cell as the labels should be mutually exclusive.) In populations where a particular cell type is expected, lack of clear bimodality for the corresponding label may indicate that its gene set is not sufficiently informative.
+par(mfrow=c(3,3))
+AUCell_exploreThresholds(cell.aucs, plotHist=TRUE, assign=TRUE)
+#> $astrocytes_ependymal
+#> $astrocytes_ependymal$aucThr
+#> $astrocytes_ependymal$aucThr$selected
+#> minimumDens
+#> 0.04144623
+#>
+#> $astrocytes_ependymal$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.20913180 93
+#> L_k2 0.20910138 93
+#> R_k3 0.57911351 43
+#> minimumDens 0.04144623 1808
+#>
+#> $astrocytes_ependymal$aucThr$comment
+#> [1] ""
+#>
+#>
+#> $astrocytes_ependymal$assignment
+#> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4"
+#> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8"
+#> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12"
+#> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16"
+#> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20"
+#> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24"
+#> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_28"
+#> [29] "Calb2_tdTpositive_cell_29" "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32"
+#> [33] "Calb2_tdTpositive_cell_33" "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36"
+#> [37] "Calb2_tdTpositive_cell_37" "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40"
+#> [41] "Calb2_tdTpositive_cell_41" "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44"
+#> [45] "Calb2_tdTpositive_cell_45" "Calb2_tdTpositive_cell_46" "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_48"
+#> [49] "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_50"
+#> [ reached getOption("max.print") -- omitted 1758 entries ]
+#>
+#>
+#> $`endothelial-mural`
+#> $`endothelial-mural`$aucThr
+#> $`endothelial-mural`$aucThr$selected
+#> R_k3
+#> 0.2463287
+#>
+#> $`endothelial-mural`$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.1196348 125
+#> L_k2 0.1653355 53
+#> R_k3 0.2463287 28
+#>
+#> $`endothelial-mural`$aucThr$comment
+#> [1] "Few cells have high AUC values (0.018% cells with AUC> 0.20). "
+#>
+#>
+#> $`endothelial-mural`$assignment
+#> [1] "Ctgf_tdTpositive_cell_1" "Ctgf_tdTpositive_cell_2" "Ctgf_tdTpositive_cell_3" "Ctgf_tdTpositive_cell_4"
+#> [5] "Ctgf_tdTpositive_cell_5" "Ctgf_tdTpositive_cell_6" "Ctgf_tdTpositive_cell_7" "Ctgf_tdTpositive_cell_8"
+#> [9] "Ctgf_tdTpositive_cell_10" "Cux2_tdTnegative_cell_10" "Ndnf_tdTpositive_cell_1" "Ndnf_tdTpositive_cell_3"
+#> [13] "Ndnf_tdTpositive_cell_4" "Ndnf_tdTpositive_cell_5" "Ndnf_tdTpositive_cell_6" "Ndnf_tdTpositive_cell_7"
+#> [17] "Ndnf_tdTpositive_cell_8" "Ndnf_tdTpositive_cell_9" "Ndnf_tdTpositive_cell_10" "Ndnf_tdTpositive_cell_11"
+#> [21] "Ndnf_tdTpositive_cell_12" "Ndnf_tdTpositive_cell_20" "Ndnf_tdTpositive_cell_21" "Nos1_tdTpositive_cell_1"
+#> [25] "Nos1_tdTpositive_cell_28" "Nos1_tdTpositive_cell_54" "Nos1_tdTpositive_cell_66" "Ntsr1_tdTnegative_cell_29"
+#>
+#>
+#> $interneurons
+#> $interneurons$aucThr
+#> $interneurons$aucThr$selected
+#> minimumDens
+#> 0.2102008
+#>
+#> $interneurons$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.4912184 482
+#> L_k2 0.2253667 1644
+#> R_k3 0.4116720 960
+#> minimumDens 0.2102008 1646
+#>
+#> $interneurons$aucThr$comment
+#> [1] "The right distribution is taller. "
+#>
+#>
+#> $interneurons$assignment
+#> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4"
+#> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8"
+#> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12"
+#> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16"
+#> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20"
+#> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24"
+#> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29"
+#> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33"
+#> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37"
+#> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41"
+#> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45"
+#> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52"
+#> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55"
+#> [ reached getOption("max.print") -- omitted 1596 entries ]
+#>
+#>
+#> $microglia
+#> $microglia$aucThr
+#> $microglia$aucThr$selected
+#> R_k3
+#> 0.4649278
+#>
+#> $microglia$aucThr$thresholds
+#> threshold nCells
+#> tenPercentOfMax 0.06006924 474
+#> Global_k1 0.09814942 88
+#> L_k2 0.11821099 58
+#> R_k3 0.46492785 23
+#>
+#> $microglia$aucThr$comment
+#> [1] "Few cells have high AUC values (0.013% cells with AUC> 0.20). "
+#>
+#>
+#> $microglia$assignment
+#> [1] "Cux2_tdTnegative_cell_5" "Cux2_tdTnegative_cell_6" "Cux2_tdTnegative_cell_12"
+#> [4] "Cux2_tdTnegative_cell_15" "Ntsr1_tdTnegative_cell_18" "Ntsr1_tdTnegative_cell_28"
+#> [7] "Rbp4_tdTnegative_cell_2" "Rbp4_tdTnegative_cell_3" "Rbp4_tdTnegative_cell_4"
+#> [10] "Rbp4_tdTnegative_cell_10" "Rbp4_tdTnegative_cell_14" "Rbp4_tdTnegative_cell_16"
+#> [13] "Rbp4_tdTnegative_cell_19" "Rbp4_tdTnegative_cell_21" "Rbp4_tdTnegative_cell_22"
+#> [16] "Rbp4_tdTnegative_cell_23" "Scnn1a-Tg3_tdTnegative_cell_2" "Scnn1a-Tg3_tdTnegative_cell_4"
+#> [19] "Scnn1a-Tg3_tdTnegative_cell_8" "Scnn1a-Tg3_tdTnegative_cell_12" "Scnn1a-Tg3_tdTnegative_cell_16"
+#> [22] "Scnn1a-Tg3_tdTnegative_cell_21" "Scnn1a-Tg3_tdTnegative_cell_24"
+#>
+#>
+#> $oligodendrocytes
+#> $oligodendrocytes$aucThr
+#> $oligodendrocytes$aucThr$selected
+#> R_k3
+#> 0.5673453
+#>
+#> $oligodendrocytes$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.2062242 82
+#> L_k2 0.2302351 65
+#> R_k3 0.5673453 34
+#>
+#> $oligodendrocytes$aucThr$comment
+#> [1] "Few cells have high AUC values (0.048% cells with AUC> 0.20). "
+#>
+#>
+#> $oligodendrocytes$assignment
+#> [1] "Gad2_tdTpositive_cell_31" "Gad2_tdTpositive_cell_44" "Gad2_tdTpositive_cell_77"
+#> [4] "Ntsr1_tdTnegative_cell_3" "Ntsr1_tdTnegative_cell_5" "Ntsr1_tdTnegative_cell_13"
+#> [7] "Ntsr1_tdTnegative_cell_16" "Ntsr1_tdTnegative_cell_19" "Ntsr1_tdTnegative_cell_22"
+#> [10] "Ntsr1_tdTnegative_cell_23" "Ntsr1_tdTnegative_cell_24" "Ntsr1_tdTnegative_cell_32"
+#> [13] "Ntsr1_tdTnegative_cell_41" "Ntsr1_tdTnegative_cell_44" "Pvalb_tdTpositive_cell_81"
+#> [16] "Rbp4_tdTnegative_cell_1" "Rbp4_tdTnegative_cell_7" "Rbp4_tdTnegative_cell_12"
+#> [19] "Rbp4_tdTnegative_cell_15" "Rbp4_tdTnegative_cell_18" "Scnn1a-Tg2_tdTnegative_cell_3"
+#> [22] "Scnn1a-Tg2_tdTnegative_cell_17" "Scnn1a-Tg3_tdTnegative_cell_1" "Scnn1a-Tg3_tdTnegative_cell_3"
+#> [25] "Scnn1a-Tg3_tdTnegative_cell_6" "Scnn1a-Tg3_tdTnegative_cell_10" "Scnn1a-Tg3_tdTnegative_cell_13"
+#> [28] "Scnn1a-Tg3_tdTnegative_cell_14" "Scnn1a-Tg3_tdTnegative_cell_15" "Scnn1a-Tg3_tdTnegative_cell_18"
+#> [31] "Scnn1a-Tg3_tdTnegative_cell_19" "Scnn1a-Tg3_tdTnegative_cell_22" "Scnn1a-Tg3_tdTnegative_cell_23"
+#> [34] "Sst_tdTpositive_cell_19"
+#>
+#>
+#> $`pyramidal CA1`
+#> $`pyramidal CA1`$aucThr
+#> $`pyramidal CA1`$aucThr$selected
+#> minimumDens
+#> 0.1269473
+#>
+#> $`pyramidal CA1`$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.3328189 542
+#> L_k2 0.6311996 0
+#> R_k3 0.1545174 1646
+#> minimumDens 0.1269473 1649
+#>
+#> $`pyramidal CA1`$aucThr$comment
+#> [1] "The global distribution overlaps the partial distributions. "
+#>
+#>
+#> $`pyramidal CA1`$assignment
+#> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4"
+#> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8"
+#> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12"
+#> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16"
+#> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20"
+#> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24"
+#> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29"
+#> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33"
+#> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37"
+#> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41"
+#> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45"
+#> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52"
+#> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55"
+#> [ reached getOption("max.print") -- omitted 1599 entries ]
+#>
+#>
+#> $`pyramidal SS`
+#> $`pyramidal SS`$aucThr
+#> $`pyramidal SS`$aucThr$selected
+#> minimumDens
+#> 0.1829744
+#>
+#> $`pyramidal SS`$aucThr$thresholds
+#> threshold nCells
+#> Global_k1 0.4960449 630
+#> L_k2 0.8889189 0
+#> R_k3 0.4359221 835
+#> minimumDens 0.1829744 1649
+#>
+#> $`pyramidal SS`$aucThr$comment
+#> [1] "The global distribution overlaps the partial distributions. "
+#>
+#>
+#> $`pyramidal SS`$assignment
+#> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4"
+#> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8"
+#> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12"
+#> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16"
+#> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20"
+#> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24"
+#> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29"
+#> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33"
+#> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37"
+#> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41"
+#> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45"
+#> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52"
+#> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55"
+#> [ reached getOption("max.print") -- omitted 1599 entries ]
+
+Interpretation of the AUCell results is most straightforward when the marker sets are mutually exclusive, as shown above for the cell type markers.
+In other applications, one might consider computing AUCs for gene sets associated with signalling or metabolic pathways. It is likely that multiple pathways will be active in any given cell, and it is tempting to use the AUCs to quantify this activity for comparison across cells.
+However, such comparisons must be interpreted with much caution as the AUCs are competitive values: any increase in one pathway’s activity will naturally reduce the AUCs for all other pathways, potentially resulting in spurious differences across the population.
+The advantage of the AUCell approach is that it does not require reference expression values. This is particularly useful when dealing with gene sets derived from the literature or other qualitative forms of biological knowledge.
+
+
+
+11.12 References
+Amezquita, R. A., Lun, A. T., Becht, E., Carey, V. J., Carpp, L. N., Geistlinger, L., … & Hicks, S. C. (2020). Orchestrating single-cell analysis with Bioconductor. Nature methods, 17(2), 137-145.
+Aran, D., A. P. Looney, L. Liu, E. Wu, V. Fong, A. Hsu, S. Chak, et al. 2019. “Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage.” Nat. Immunol. 20 (2): 163–72.
+Bach, K., S. Pensa, M. Grzelak, J. Hadfield, D. J. Adams, J. C. Marioni, and W. T. Khaled. 2017. “Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing.” Nat Commun 8 (1): 2128.
+Martens, J. H., and H. G. Stunnenberg. 2013. “BLUEPRINT: mapping human blood cell epigenomes.” Haematologica 98 (10): 1487–9.
+Muraro, M. J., G. Dharmadhikari, D. Grun, N. Groen, T. Dielen, E. Jansen, L. van Gurp, et al. 2016. “A Single-Cell Transcriptome Atlas of the Human Pancreas.” Cell Syst 3 (4): 385–94.
+Segerstolpe, A., A. Palasantza, P. Eliasson, E. M. Andersson, A. C. Andreasson, X. Sun, S. Picelli, et al. 2016. “Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes.” Cell Metab. 24 (4): 593–607.
+Tasic, B., V. Menon, T. N. Nguyen, T. K. Kim, T. Jarsky, Z. Yao, B. Levi, et al. 2016. “Adult mouse cortical cell taxonomy revealed by single cell transcriptomics.” Nat. Neurosci. 19 (2): 335–46.
+The ENCODE Project Consortium. 2012. “An integrated encyclopedia of DNA elements in the human genome.” Nature 489 (7414): 57–74.
+Zeisel, A., A. B. Munoz-Manchado, S. Codeluppi, P. Lonnerberg, G. La Manno, A. Jureus, S. Marques, et al. 2015. “Brain structure. Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq.” Science 347 (6226): 1138–42.
diff --git a/search_index.json b/search_index.json
index eb38d1a..757284f 100644
--- a/search_index.json
+++ b/search_index.json
@@ -1 +1 @@
-[["index.html", "Statistical Analysis of Genome Scale Data 2024 Overview Download course materials Code of Conduct Course Schedule External links Course Prerequisites R session information License", " Statistical Analysis of Genome Scale Data 2024 Leonardo Collado-Torres Overview Here you can find the files for the June 2024 Statistical Analysis of Genome Scale Data course at CSHL portion taught by Leo and his team (June 9-11). Instructor: Leonardo Collado-Torres, Twitter Teaching assistants: Daianna González Padilla, Twitter Melissa Mayén Quiroz, Twitter Thanks again Sean Davis for inviting us to help teach https://t.co/KulvuQ3XK8 at @cshlcourses!Our teaching materials are available at https://t.co/OP2YYZmqwh 📚 It’s an honor to teach with @lcgunam students & remote @LieberInstitute team members#rstats @Bioconductor pic.twitter.com/U4DzQuPvIn — 🇲🇽 Leonardo Collado-Torres (@lcolladotor) June 9, 2024 Download course materials Download the materials for this course with usethis::use_course('lcolladotor/cshl_rstats_genome_scale_2024') or view online at lcolladotor.github.io/cshl_rstats_genome_scale_2024. This command downloads a static version of the course materials. If you want to be able to easily download updates, we recommend using Git. Happy Git and GitHub for the useR is great for getting your computer ready to use Git and GitHub. If you already have a GitHub account, you can instead use this command to download the course: ## Download it the first time git clone https://github.com/lcolladotor/cshl_rstats_genome_scale_2024.git ## To update the contents, use: cd cshl_rstats_genome_scale_2024 git pull Or you could use the GitHub Desktop application. Code of Conduct We’ll follow the CSHL code of conduct as well as version 1.2.0 of the Bioconductor code of conduct bioconductor.github.io/bioc_coc_multilingual/. For reporting any violations of the code of conduct, report them to the Instructor and/or Course Coordinators. 
Course Schedule Local times in US Eastern See CSHLData2024 for the detailed schedule. External links CSHL course GitHub source code Slack LieberInstitute/template_project LIBD rstats club: check the public schedule 2023 course version Course Prerequisites Install R 4.4.x from CRAN then install the following R packages: ## For installing Bioconductor packages if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } ## Install required packages BiocManager::install( c( "usethis", ## Utilities "BiocFileCache", "RefManageR", "gitcreds", "gert", "gh", "here", "Hmisc", "biocthis", "lobstr", "postcards", "scater", "sessioninfo", "stringr", "SummarizedExperiment", ## Main containers / vis "iSEE", "edgeR", ## RNA-seq "ExploreModelMatrix", "limma", "smokingMouse", "recount3", "rlang", "scRNAseq", "airway", "pheatmap", ## Visualization "ggplot2", "ggrepel", "patchwork", "RColorBrewer", "ComplexHeatmap", "cowplot", "Polychrome", "spatialLIBD", ## Advanced "variancePartition" ) ) You will also need to install RStudio version 2024.04.0+735 or newer. R session information Details on the R version used for making this book. The source code is available at lcolladotor/cshl_rstats_genome_scale_2024. 
## Load the package at the top of your script library("sessioninfo") ## Utilities library("BiocFileCache") library("BiocStyle") library("biocthis") library("gitcreds") library("gert") library("gh") library("here") library("lobstr") library("postcards") library("usethis") library("sessioninfo") ## Data library("smokingMouse") library("scRNAseq") ## Main containers / vis library("SummarizedExperiment") library("iSEE") ## RNA-seq library("airway") library("edgeR") library("ExploreModelMatrix") library("limma") library("recount3") ## QCA library("scater") ## Variance Partition library("variancePartition") ## Visualization: plots & text library("ComplexHeatmap") library("ggplot2") library("patchwork") library("pheatmap") library("RColorBrewer") library("Hmisc") library("stringr") library("cowplot") library("rlang") library("ggrepel") library("Polychrome") ## Spatial transcriptomics library("spatialLIBD") ## Reproducibility information options(width = 120) session_info() ## ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────── ## setting value ## version R version 4.4.0 (2024-04-24) ## os Ubuntu 22.04.4 LTS ## system x86_64, linux-gnu ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz UTC ## date 2024-06-10 ## pandoc 3.1.13 @ /usr/bin/ (via rmarkdown) ## ## ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── ## package * version date (UTC) lib source ## abind 1.4-5 2016-07-21 [1] RSPM (R 4.4.0) ## airway * 1.24.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.base 1.4.1 2024-05-03 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.matrix 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.ranges 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.sce 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.schemas 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.se 1.4.1 
2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationDbi 1.66.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationFilter 1.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationHub 3.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## aod 1.3.3 2023-12-13 [1] RSPM (R 4.4.0) ## askpass 1.2.0 2023-09-03 [2] RSPM (R 4.4.0) ## attempt 0.3.1 2020-05-03 [1] RSPM (R 4.4.0) ## backports 1.5.0 2024-05-23 [1] RSPM (R 4.4.0) ## base64enc 0.1-3 2015-07-28 [2] RSPM (R 4.4.0) ## beachmat 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## beeswarm 0.4.0 2021-06-01 [1] RSPM (R 4.4.0) ## benchmarkme 1.0.8 2022-06-12 [1] RSPM (R 4.4.0) ## benchmarkmeData 1.0.4 2020-04-23 [1] RSPM (R 4.4.0) ## Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocFileCache * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocIO 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocManager 1.30.23 2024-05-04 [2] CRAN (R 4.4.0) ## BiocNeighbors 1.22.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocParallel * 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocSingular 1.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocStyle * 2.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## biocthis * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocVersion 3.19.1 2024-04-17 [2] Bioconductor 3.19 (R 4.4.0) ## Biostrings 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.0) ## bit 4.0.5 2022-11-15 [1] RSPM (R 4.4.0) ## bit64 4.0.5 2020-08-30 [1] RSPM (R 4.4.0) ## bitops 1.0-7 2021-04-24 [1] RSPM (R 4.4.0) ## blob 1.2.4 2023-03-17 [1] RSPM (R 4.4.0) ## bookdown 0.39 2024-04-15 [1] RSPM (R 4.4.0) ## boot 1.3-30 2024-02-26 [3] CRAN (R 4.4.0) ## broom 1.0.6 2024-05-17 [1] RSPM (R 4.4.0) ## bslib 0.7.0 2024-03-29 [2] RSPM (R 4.4.0) ## cachem 1.1.0 2024-05-16 [2] RSPM (R 4.4.0) ## caTools 1.18.2 2021-03-28 [1] RSPM (R 4.4.0) ## checkmate 2.3.1 2023-12-04 [1] RSPM (R 4.4.0) ## circlize 0.4.16 
2024-02-20 [1] RSPM (R 4.4.0) ## cli 3.6.2 2023-12-11 [2] RSPM (R 4.4.0) ## clue 0.3-65 2023-09-23 [1] RSPM (R 4.4.0) ## cluster 2.1.6 2023-12-01 [3] CRAN (R 4.4.0) ## codetools 0.2-20 2024-03-31 [3] CRAN (R 4.4.0) ## colorspace 2.1-0 2023-01-23 [1] RSPM (R 4.4.0) ## colourpicker 1.3.0 2023-08-21 [1] RSPM (R 4.4.0) ## ComplexHeatmap * 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## config 0.3.2 2023-08-30 [1] RSPM (R 4.4.0) ## corpcor 1.6.10 2021-09-16 [1] RSPM (R 4.4.0) ## cowplot * 1.1.3 2024-01-22 [1] RSPM (R 4.4.0) ## crayon 1.5.2 2022-09-29 [2] RSPM (R 4.4.0) ## credentials 2.0.1 2023-09-06 [2] RSPM (R 4.4.0) ## curl 5.2.1 2024-03-01 [1] RSPM (R 4.4.0) ## data.table 1.15.4 2024-03-30 [1] RSPM (R 4.4.0) ## DBI 1.2.3 2024-06-02 [1] RSPM (R 4.4.0) ## dbplyr * 2.5.0 2024-03-19 [1] RSPM (R 4.4.0) ## DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.0) ## DelayedMatrixStats 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## digest 0.6.35 2024-03-11 [2] RSPM (R 4.4.0) ## doParallel 1.0.17 2022-02-07 [1] RSPM (R 4.4.0) ## dotCall64 1.1-1 2023-11-28 [1] RSPM (R 4.4.0) ## dplyr 1.1.4 2023-11-17 [1] RSPM (R 4.4.0) ## DT 0.33 2024-04-04 [1] RSPM (R 4.4.0) ## edgeR * 4.2.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ensembldb 2.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## EnvStats 2.8.1 2023-08-22 [1] RSPM (R 4.4.0) ## evaluate 0.23 2023-11-01 [2] RSPM (R 4.4.0) ## ExperimentHub 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ExploreModelMatrix * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## fANCOVA 0.6-1 2020-11-13 [1] RSPM (R 4.4.0) ## fansi 1.0.6 2023-12-08 [2] RSPM (R 4.4.0) ## fastmap 1.2.0 2024-05-15 [2] RSPM (R 4.4.0) ## fields 15.2 2023-08-17 [1] RSPM (R 4.4.0) ## filelock 1.0.3 2023-12-11 [1] RSPM (R 4.4.0) ## foreach 1.5.2 2022-02-02 [1] RSPM (R 4.4.0) ## foreign 0.8-86 2023-11-28 [3] CRAN (R 4.4.0) ## Formula 1.2-5 2023-02-24 [1] RSPM (R 4.4.0) ## fs 1.6.4 2024-04-25 [2] RSPM (R 4.4.0) ## generics 0.1.3 2022-07-05 [1] RSPM (R 
4.4.0) ## GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) ## GenomeInfoDbData 1.2.12 2024-05-26 [1] Bioconductor ## GenomicAlignments 1.40.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## GenomicFeatures 1.56.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## GenomicRanges * 1.56.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## gert * 2.0.1 2023-12-04 [2] RSPM (R 4.4.0) ## GetoptLong 1.0.5 2020-12-15 [1] RSPM (R 4.4.0) ## ggbeeswarm 0.7.2 2023-04-29 [1] RSPM (R 4.4.0) ## ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0) ## ggrepel * 0.9.5 2024-01-10 [1] RSPM (R 4.4.0) ## gh * 1.4.1 2024-03-28 [2] RSPM (R 4.4.0) ## gitcreds * 0.1.2 2022-09-08 [2] RSPM (R 4.4.0) ## GlobalOptions 0.1.2 2020-06-10 [1] RSPM (R 4.4.0) ## glue 1.7.0 2024-01-09 [2] RSPM (R 4.4.0) ## golem 0.4.1 2023-06-05 [1] RSPM (R 4.4.0) ## gplots 3.1.3.1 2024-02-02 [1] RSPM (R 4.4.0) ## gridExtra 2.3 2017-09-09 [1] RSPM (R 4.4.0) ## gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0) ## gtools 3.9.5 2023-11-20 [1] RSPM (R 4.4.0) ## gypsum 1.0.1 2024-05-08 [1] Bioconductor 3.19 (R 4.4.0) ## HDF5Array 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## here * 1.0.1 2020-12-13 [1] RSPM (R 4.4.0) ## Hmisc * 5.1-3 2024-05-28 [1] RSPM (R 4.4.0) ## htmlTable 2.4.2 2023-10-29 [1] RSPM (R 4.4.0) ## htmltools 0.5.8.1 2024-04-04 [2] RSPM (R 4.4.0) ## htmlwidgets 1.6.4 2023-12-06 [2] RSPM (R 4.4.0) ## httpuv 1.6.15 2024-03-26 [2] RSPM (R 4.4.0) ## httr 1.4.7 2023-08-15 [2] RSPM (R 4.4.0) ## httr2 1.0.1 2024-04-01 [2] RSPM (R 4.4.0) ## igraph 2.0.3 2024-03-13 [1] RSPM (R 4.4.0) ## IRanges * 2.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## irlba 2.3.5.1 2022-10-03 [1] RSPM (R 4.4.0) ## iSEE * 2.16.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## iterators 1.0.14 2022-02-05 [1] RSPM (R 4.4.0) ## jquerylib 0.1.4 2021-04-26 [2] RSPM (R 4.4.0) ## jsonlite 1.8.8 2023-12-04 [2] RSPM (R 4.4.0) ## KEGGREST 1.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## KernSmooth 2.23-24 2024-05-17 [3] RSPM (R 4.4.0) ## knitr 1.47 
2024-05-29 [2] RSPM (R 4.4.0) ## later 1.3.2 2023-12-06 [2] RSPM (R 4.4.0) ## lattice 0.22-6 2024-03-20 [3] CRAN (R 4.4.0) ## lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.4.0) ## lifecycle 1.0.4 2023-11-07 [2] RSPM (R 4.4.0) ## limma * 3.60.2 2024-05-19 [1] Bioconductor 3.19 (R 4.4.0) ## listviewer 4.0.0 2023-09-30 [1] RSPM (R 4.4.0) ## lme4 1.1-35.3 2024-04-16 [1] RSPM (R 4.4.0) ## lmerTest 3.1-3 2020-10-23 [1] RSPM (R 4.4.0) ## lobstr * 1.1.2 2022-06-22 [1] RSPM (R 4.4.0) ## locfit 1.5-9.9 2024-03-01 [1] RSPM (R 4.4.0) ## magick 2.8.3 2024-02-18 [1] RSPM (R 4.4.0) ## magrittr 2.0.3 2022-03-30 [2] RSPM (R 4.4.0) ## maps 3.4.2 2023-12-15 [1] RSPM (R 4.4.0) ## MASS 7.3-60.2 2024-05-06 [3] local ## Matrix 1.7-0 2024-03-22 [3] CRAN (R 4.4.0) ## MatrixGenerics * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## matrixStats * 1.3.0 2024-04-11 [1] RSPM (R 4.4.0) ## memoise 2.0.1 2021-11-26 [2] RSPM (R 4.4.0) ## mgcv 1.9-1 2023-12-21 [3] CRAN (R 4.4.0) ## mime 0.12 2021-09-28 [2] RSPM (R 4.4.0) ## miniUI 0.1.1.1 2018-05-18 [2] RSPM (R 4.4.0) ## minqa 1.2.7 2024-05-20 [1] RSPM (R 4.4.0) ## munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0) ## mvtnorm 1.2-5 2024-05-21 [1] RSPM (R 4.4.0) ## nlme 3.1-165 2024-06-06 [3] RSPM (R 4.4.0) ## nloptr 2.0.3 2022-05-26 [1] RSPM (R 4.4.0) ## nnet 7.3-19 2023-05-03 [3] CRAN (R 4.4.0) ## numDeriv 2016.8-1.1 2019-06-06 [1] RSPM (R 4.4.0) ## openssl 2.2.0 2024-05-16 [2] RSPM (R 4.4.0) ## paletteer 1.6.0 2024-01-21 [1] RSPM (R 4.4.0) ## patchwork * 1.2.0 2024-01-08 [1] RSPM (R 4.4.0) ## pbkrtest 0.5.2 2023-01-19 [1] RSPM (R 4.4.0) ## pheatmap * 1.0.12 2019-01-04 [1] RSPM (R 4.4.0) ## pillar 1.9.0 2023-03-22 [2] RSPM (R 4.4.0) ## pkgconfig 2.0.3 2019-09-22 [2] RSPM (R 4.4.0) ## plotly 4.10.4 2024-01-13 [1] RSPM (R 4.4.0) ## plyr 1.8.9 2023-10-02 [1] RSPM (R 4.4.0) ## png 0.1-8 2022-11-29 [1] RSPM (R 4.4.0) ## Polychrome * 1.5.1 2022-05-03 [1] RSPM (R 4.4.0) ## postcards * 0.2.3 2022-01-07 [1] RSPM (R 4.4.0) ## promises 1.3.0 2024-04-05 [2] RSPM (R 
4.4.0) ## ProtGenerics 1.36.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## purrr 1.0.2 2023-08-10 [2] RSPM (R 4.4.0) ## R.cache 0.16.0 2022-07-21 [1] RSPM (R 4.4.0) ## R.methodsS3 1.8.2 2022-06-13 [1] RSPM (R 4.4.0) ## R.oo 1.26.0 2024-01-24 [1] RSPM (R 4.4.0) ## R.utils 2.12.3 2023-11-18 [1] RSPM (R 4.4.0) ## R6 2.5.1 2021-08-19 [2] RSPM (R 4.4.0) ## rappdirs 0.3.3 2021-01-31 [2] RSPM (R 4.4.0) ## rbibutils 2.2.16 2023-10-25 [1] RSPM (R 4.4.0) ## RColorBrewer * 1.1-3 2022-04-03 [1] RSPM (R 4.4.0) ## Rcpp 1.0.12 2024-01-09 [2] RSPM (R 4.4.0) ## RCurl 1.98-1.14 2024-01-09 [1] RSPM (R 4.4.0) ## Rdpack 2.6 2023-11-08 [1] RSPM (R 4.4.0) ## recount3 * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## remaCor 0.0.18 2024-02-08 [1] RSPM (R 4.4.0) ## rematch2 2.1.2 2020-05-01 [2] RSPM (R 4.4.0) ## reshape2 1.4.4 2020-04-09 [1] RSPM (R 4.4.0) ## restfulr 0.0.15 2022-06-16 [1] RSPM (R 4.4.0) ## rhdf5 2.48.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## rhdf5filters 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## Rhdf5lib 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## RhpcBLASctl 0.23-42 2023-02-11 [1] RSPM (R 4.4.0) ## rintrojs 0.3.4 2024-01-11 [1] RSPM (R 4.4.0) ## rjson 0.2.21 2022-01-09 [1] RSPM (R 4.4.0) ## rlang * 1.1.4 2024-06-04 [2] RSPM (R 4.4.0) ## rmarkdown 2.27 2024-05-17 [2] RSPM (R 4.4.0) ## rpart 4.1.23 2023-12-05 [3] CRAN (R 4.4.0) ## rprojroot 2.0.4 2023-11-05 [2] RSPM (R 4.4.0) ## Rsamtools 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## RSQLite 2.3.7 2024-05-27 [1] RSPM (R 4.4.0) ## rstudioapi 0.16.0 2024-03-24 [2] RSPM (R 4.4.0) ## rsvd 1.0.5 2021-04-16 [1] RSPM (R 4.4.0) ## rtracklayer 1.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.0) ## S4Vectors * 0.42.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## sass 0.4.9 2024-03-15 [2] RSPM (R 4.4.0) ## ScaledMatrix 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0) ## scater * 1.32.0 
2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## scatterplot3d 0.3-44 2023-05-05 [1] RSPM (R 4.4.0) ## scRNAseq * 2.18.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) ## scuttle * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## sessioninfo * 1.2.2 2021-12-06 [2] RSPM (R 4.4.0) ## shape 1.4.6.1 2024-02-23 [1] RSPM (R 4.4.0) ## shiny 1.8.1.1 2024-04-02 [2] RSPM (R 4.4.0) ## shinyAce 0.4.2 2022-05-06 [1] RSPM (R 4.4.0) ## shinydashboard 0.7.2 2021-09-30 [1] RSPM (R 4.4.0) ## shinyjs 2.1.0 2021-12-23 [1] RSPM (R 4.4.0) ## shinyWidgets 0.8.6 2024-04-24 [1] RSPM (R 4.4.0) ## SingleCellExperiment * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## smokingMouse * 0.99.91 2024-06-10 [1] Github (LieberInstitute/smokingMouse@96d8480) ## spam 2.10-0 2023-10-23 [1] RSPM (R 4.4.0) ## SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) ## sparseMatrixStats 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## SpatialExperiment * 1.14.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## spatialLIBD * 1.16.2 2024-05-28 [1] Bioconductor 3.19 (R 4.4.0) ## statmod 1.5.0 2023-01-06 [1] RSPM (R 4.4.0) ## stringi 1.8.4 2024-05-06 [2] RSPM (R 4.4.0) ## stringr * 1.5.1 2023-11-14 [2] RSPM (R 4.4.0) ## styler 1.10.3 2024-04-07 [1] RSPM (R 4.4.0) ## SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## sys 3.4.2 2023-05-23 [2] RSPM (R 4.4.0) ## tibble 3.2.1 2023-03-20 [2] RSPM (R 4.4.0) ## tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.4.0) ## tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0) ## UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## usethis * 2.2.3 2024-02-19 [2] RSPM (R 4.4.0) ## utf8 1.2.4 2023-10-22 [2] RSPM (R 4.4.0) ## variancePartition * 1.34.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## vctrs 0.6.5 2023-12-01 [2] RSPM (R 4.4.0) ## vipor 0.4.7 2023-12-18 [1] RSPM (R 4.4.0) ## viridis 0.6.5 2024-01-29 [1] RSPM (R 4.4.0) ## viridisLite 0.4.2 2023-05-02 [1] RSPM (R 4.4.0) ## withr 3.0.0 2024-01-16 [2] RSPM (R 4.4.0) ## xfun 0.44 2024-05-15 [2] RSPM 
(R 4.4.0) ## XML 3.99-0.16.1 2024-01-22 [1] RSPM (R 4.4.0) ## xtable 1.8-4 2019-04-21 [2] RSPM (R 4.4.0) ## XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## yaml 2.3.8 2023-12-11 [2] RSPM (R 4.4.0) ## zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ## [1] /__w/_temp/Library ## [2] /usr/local/lib/R/site-library ## [3] /usr/local/lib/R/library ## ## ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── proc.time() ## user system elapsed ## 13.821 1.070 14.742 System curl version: curl::curl_version() ## $version ## [1] "7.81.0" ## ## $ssl_version ## [1] "OpenSSL/3.0.2" ## ## $libz_version ## [1] "1.2.11" ## ## $libssh_version ## [1] "libssh/0.9.6/openssl/zlib" ## ## $libidn_version ## [1] "2.3.2" ## ## $host ## [1] "x86_64-pc-linux-gnu" ## ## $protocols ## [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap" ## [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp" ## [23] "smtps" "telnet" "tftp" ## ## $ipv6 ## [1] TRUE ## ## $http2 ## [1] TRUE ## ## $idn ## [1] TRUE This interactive book was last updated at 2024-06-10 23:27:05.412221. License This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. "],["summarizedexperiment-overview.html", "1 SummarizedExperiment overview 1.1 Overview 1.2 Exercises 1.3 Solutions", " 1 SummarizedExperiment overview Instructor: Leo LIBD rstats club notes 1.1 Overview The SummarizedExperiment class is used to store experimental results in the form of matrixes. Objects of this class include observations (features) of the samples, as well as additional metadata. Usually, this type of object is automatically generated as the output of other software (ie. SPEAQeasy), but you can also build them. One of the main characteristics of SummarizedExperiment is that it allows you to handle you data in a “coordinated” way. 
For example, if you want to subset your data, with SummarizedExperiment you can do so without worrying about keeping your assays and metadata synched. 1.2 Exercises We are gonna use the sample data set from the airway library. library("SummarizedExperiment") library("airway") data(airway, package = "airway") se <- airway p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: a) How many genes do we have in this object? And samples? b) How many samples come from donors treated (trt) with dexamethasone (dex)? ## For a) you could only print the summary of the object but since the idea is ## to understand how to explore the object find other function that gives ## you the answer. se #> class: RangedSummarizedExperiment #> dim: 63677 8 #> metadata(1): '' #> assays(1): counts #> rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492 ENSG00000273493 #> rowData names(10): gene_id gene_name ... seq_coord_system symbol #> colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521 #> colData names(9): SampleName cell ... Sample BioSample ## Same thing for b, you could just print the colData and count the samples, ## but this is not efficient when our data consists in hundreds of samples. ## Find the answer using other tools. 
colData(se) #> DataFrame with 8 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345 SRS508568 SAMN02422669 #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349 SRS508571 SAMN02422678 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> [ reached getOption("max.print") -- omitted 4 rows ] Exercise 2: Add another assay that has the log10 of your original counts ## In our object, if you look at the part that says assays, we can see that ## at the moment we only have one with the name "counts". se #> class: RangedSummarizedExperiment #> dim: 63677 8 #> metadata(1): '' #> assays(1): counts #> rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492 ENSG00000273493 #> rowData names(10): gene_id gene_name ... seq_coord_system symbol #> colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521 #> colData names(9): SampleName cell ... Sample BioSample ## To see the data that's stored in that assay you can do either one of the ## next commands. 
assay(se) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] assays(se)$counts #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] ## Note that assay() does not support $ operator # assay(se)$counts ## We would have to do: assay(se, 1) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] assay(se, "counts") #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] ## If you use assays() without specifying the element you want to see it ## shows you the length of the list and the name of each element. 
assays(se) #> List of length 1 #> names(1): counts ## To obtain a list of names as a vector you can use: assayNames(se) #> [1] "counts" ## Which can also be use to change the name of the assays assayNames(se)[1] <- "foo" assayNames(se) #> [1] "foo" assayNames(se)[1] <- "counts" Exercise 3: Explore the metadata and add a new column that has the library size of each sample. ## To calculate the library size use apply(assay(se), 2, sum) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> 20637971 18809481 25348649 15163415 24448408 30818215 19126151 21164133 1.3 Solutions p.solution { background-color: #C093D6; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Solution 1: ## For a), dim() gives the desired answer dim(se) #> [1] 63677 8 ## For b), colData(se)[colData(se)$dex == "trt", ] #> DataFrame with 4 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673 #> SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 SRS508580 SAMN02422677 colData(se)[se$dex == "trt", ] #> DataFrame with 4 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673 #> SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 
SRS508580 SAMN02422677 Solution 2: ## There are multiple ways to do it assay(se, "logcounts") <- log10(assay(se, "counts")) assays(se)$logcounts_v2 <- log10(assays(se)$counts) Solution 3: ## To add the library size we an use.. colData(se)$library_size <- apply(assay(se), 2, sum) names(colData(se)) #> [1] "SampleName" "cell" "dex" "albut" "Run" "avgLength" "Experiment" #> [8] "Sample" "BioSample" "library_size" "],["interactive-summarizedexperiment-visualizations.html", "2 Interactive SummarizedExperiment visualizations 2.1 Classes for iSEE 2.2 Getting Started with iSEE 2.3 Description of the user interface 2.4 Let’s practice! 2.5 Introduction to Advanced iSEE Features 2.6 References 2.7 Community", " 2 Interactive SummarizedExperiment visualizations Instructor: Melissa Mayén Quiroz How can you make plots from “SummarizedExperiment” objects without having to write any code? The answer is with “iSEE” http://bioconductor.org/packages/iSEE http://bioconductor.org/packages/release/bioc/vignettes/iSEE/inst/doc/basic.html iSEE is a Bioconductor package that provides an interactive Shiny-based graphical user interface for exploring data stored in SummarizedExperiment objects (Rue-Albrecht et al. 2018). 2.1 Classes for iSEE SummarizedExperiment (SE) and SingleCellExperiment (SCE) are classes in R. Classes serve as templates for creating objects that contain data and methods for manipulating those data. 2.1.1 SummarizedExperiment class Assay Data: The primary data matrix containing quantitative measurements, such as gene expression values or read counts. Rows represent features (e.g., genes, transcripts) and columns represent samples (e.g., experimental conditions, individuals). Row Metadata (rowData): Additional information about the features in the assay data. This can include annotations, identifiers, genomic coordinates, and other relevant information. Column Metadata (colData): Additional information about the samples in the assay data. 
This can include sample annotations, experimental conditions, treatment groups, and other relevant information. metadata: Additional information about the experiment. 2.1.2 SingleCellExperiment This object is specifically designed to store and analyze single-cell RNA sequencing (scRNA-seq) data. It extends the SummarizedExperiment class to include specialized features for single-cell data, such as cell identifiers, dimensionality reduction results, and methods for quality control and normalization. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent cells. colData (Column Metadata): Additional information about each cell, such as cell type, experimental condition, or any other relevant metadata. rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. reducedDims: Dimensionality reduction results, such as “principal component analysis” (PCA), “t-distributed stochastic neighbor embedding” (t-SNE), and “Uniform Manifold Approximation and Projection” (UMAP), used for visualizing and clustering cells. altExpNames and altExps: Names of alternative experiments (such as spike-in control genes used for normalization) and alternative experiment counts matrices. metadata: Additional metadata about the experiment. 2.1.3 SpatialExperiment This object extends the SingleCellExperiment class and is designed to store and analyze spatially-resolved transcriptomics data. Spatial transcriptomics combines gene expression data with spatial information, providing insights into the spatial organization of tissues. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent spatial spots or pixels. colData (Column Metadata): Additional information about each spatial spot or pixel, such as spatial coordinates, tissue section, or any other relevant metadata. 
rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. spatialCoords: A matrix or data frame containing the spatial coordinates (e.g., x and y coordinates) of each spot or pixel, which is crucial for spatial analyses and visualization. imgData: Links to image data associated with the spatial transcriptomics experiment, such as histology images or microscopy images, which provide the spatial context for the transcriptomics data. reducedDims: Dimensionality reduction results for visualizing and clustering spatial spots or pixels, similar to the SingleCellExperiment class. metadata: Additional metadata about the experiment. 2.2 Getting Started with iSEE Reference manual Adapted from The iSEE User’s Guide Installation (R version “4.4”). In this case, the package is already installed so we just need to load it. # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("iSEE") packageVersion("iSEE") #> [1] '2.16.0' library("iSEE") Documentation browseVignettes("iSEE") Use (simple launch): If you have a SummarizedExperiment object (se) or an instance of a subclass, like a SingleCellExperiment object (sce), you can launch an iSEE app by running: ## Launch iSEE for the se ("SummarizedExperiment" object) iSEE(se) ## Launch iSEE for the sce ("SingleCellExperiment" object) iSEE(sce) 2.3 Description of the user interface By default, the app starts with a dashboard that contains one panel or table of each type. By opening the collapsible panels named “Data parameters”, “Visual parameters”, and “Selection parameters” under each plot, we can control the content and appearance of each panel. Introductory tour: In the upper right corner there is a question mark icon ❓. Clicking it and then on the hand button you can have an introductory tour. 
During this tour, you will be taken through the different components of the iSEE user interface and learn the basic usage mechanisms by doing small actions guided by the tutorial: the highlighted elements will be responding to your actions, while the rest of the UI will be shaded. 2.3.1 Header The layout of the iSEE user interface uses the shinydashboard package. The dashboard header contains four dropdown menus. The “Organization” menu, which is identified by an icon displaying multiple windows “Export” dropdown menu, which is identified by a download icon The “Documentation” dropdown menu which is identified by a question mark icon ❓ The “Additional Information” dropdown menu which is identified by the information icon ℹ️ 2.3.1.1 Organization menu The “Organization” dropdown menu, dentified by an icon displaying multiple windows, includes: “Organize panels” button opens a modal window that contains: A selectize input to add, remove, and reorder panels in the main interface. Two inputs to control the width and height, respectively, of each panel selected above. The “Examine panel chart” feature, identified by a chain icon, allows you to visualize the relationships and point selections among your visible plot and table panels. Each panel is represented by a node, color-coded to match the app. (This functionality is particularly useful in sessions with many panels, helping you to see the structure of how panels send and receive data point selections). 2.3.1.2 Export dropdown menu The “Export” dropdown menu, identified by a download icon, includes: The “Download panel output” feature that allows you to download a zip folder containing the currently displayed panel content, including high-resolution figures and table contents as CSV files. The “Extract the R code” feature which provides a way to record the exact code that reproduces the current state of each plot. 
Clicking on this button opens a popup window with a text editor displaying the formatted code with syntax highlighting. You can copy this code, including initial lines and sessionInfo() commands, to your clipboard for inclusion in your analysis report or script. This code can then be further edited for publication. “Display panel settings” lets you export the code defining the current state of the panels in the interface. This is useful for pre-configuring an iSEE instance to start in the current state rather than with the default set of panels. 2.3.1.3 Documentation Menu The “Documentation” dropdown, accessible through the question mark icon ❓, includes: Interactive Tour: Launches a guided tour of iSEE, teaching basic usage interactively. Open Vignette: Displays the iSEE vignette, either locally or from the Bioconductor project site. 2.3.1.4 Additional Information Menu The “Additional Information” dropdown, accessible through the information icon ℹ️, includes: About this Session: Shows the output of the sessionInfo() function in a popup. About iSEE: Provides information on the development team, licensing, citation, and links to the GitHub repository for following development and contributing suggestions. 2.3.2 Panel types The main element in the body of iSEE is the combination of panels, generated (and optionally linked to one another) according to your actions. There are currently eight standard panel types that can be generated with iSEE: Reduced dimension plot Column data table Column data plot Feature assay plot Row data table Row data plot Sample assay plot Complex heatmap In addition, custom panel types can be defined. 2.3.3 Parameter sets For each standard plot panel, three different sets of parameters will be available in collapsible boxes: “Data parameters”, to control parameters specific to each type of plot. 
“Visual parameters”, to specify parameters that will determine the aspect of the plot, in terms of coloring, point features, and more (e.g., legend placement, font size). “Selection parameters” to control the incoming point selection and link relationships to other plots. 2.3.4 Reduced dimension plots If a SingleCellExperiment object is supplied to the iSEE::iSEE() function, reduced dimension results are extracted from the reducedDim slot. Examples include low-dimensional embeddings from principal components analysis (PCA) or t-distributed stochastic neighbour embedding (t-SNE). These results are used to construct a two-dimensional Reduced dimension plot where each point is a sample, to facilitate efficient exploration of high-dimensional datasets. The “Data parameters” control the reducedDim slot to be displayed, as well as the two dimensions to plot against each other. Note that this built-in panel does not compute reduced dimension embeddings; they must be precomputed and available in the object provided to the iSEE() function. Nevertheless, custom panels - such as the iSEE DynamicReducedDimensionPlot - can be developed and used to enable such features. 2.3.5 Column data plots A Column data plot visualizes sample metadata contained in column metadata. Different fields can be used for the x- and y-axes by selecting appropriate values in the “Data parameters” box. This plot can assume various forms, depending on the nature of the data on the x- and y-axes: If the y-axis is continuous and the x-axis is categorical, violin plots are generated (grouped by the x-axis factor). If the y-axis is categorical and the x-axis is continuous, horizontal violin plots are generated (grouped by the y-axis factor). If both axes are continuous, a scatter plot is generated. This enables the use of contours that are overlaid on top of the plot, check the “Other” box to see the available options. 
If both axes are categorical, a plot of squares (Hinton plot) is generated where the area of each square is proportional to the number of samples within each combination of factor levels. 2.3.6 Feature assay plots A Feature assay plot visualizes the assayed values (e.g., gene expression) for a particular feature (e.g., gene) across the samples on the y-axis. This usually results in a (grouped) violin plot, if the x-axis is set to “None” or a categorical variable; or a scatter plot, if the x-axis is another continuous variable. Gene selection for the y-axis can be achieved by using a linked row data table in another panel. Clicking on a row in the table automatically changes the assayed values plotted on the y-axis. Alternatively, the row name can be directly entered as text that corresponds to an entry of rownames(se). (This is not effective if se does not contain row names.) 2.3.7 Row data plots A Row data plot allows the visualization of information stored in the rowData slot of a “SummarizedExperiment” object. Its behavior mirrors the implementation for the Column data plot, and correspondingly this plot can assume various forms depending on whether the data are categorical or continuous. 2.3.8 Sample assay plots A Sample assay plot visualizes the assayed values (e.g., gene expression) for a particular sample (e.g., cell) across the features on the y-axis. This usually results in a (grouped) violin plot, if the x-axis is set to “None” or a categorical variable (e.g., gene biotype); or a scatter plot, if the x-axis is another continuous variable. Notably, the x-axis covariate can also be set to: A discrete row data covariates (e.g., gene biotype), to stratify the distribution of assayed values A continuous row data covariate (e.g., count of cells expressing each gene) Another sample, to visualize and compare the assayed values in any two samples. 2.3.9 Row data tables A Row data table contains the values of the rowData slot. 
If none are available, a column named Present is added and set to TRUE for all features, to avoid issues with DT::datatable() and an empty DataFrame. Typically, these tables are used to link to other plots to determine the features to use for plotting or coloring. 2.3.10 Column data tables A Column data table contains the values of the colData slot. Its behavior mirrors the implementation for the Row data table. Correspondingly, if none are available, a column named Present is added and set to TRUE for all samples. Typically, these tables are used to link to other plots to determine the samples to use for plotting or coloring. 2.3.11 Heat maps Heat map panels provide a compact overview of the data for multiple features in the form of color-coded matrices. These correspond to the assays stored in the SCE/SE object, where features (e.g., genes) are the rows and samples are the columns. Users can select features (rows) to display from the selectize widget (which supports autocompletion), or also via other panels, like row data plots or row data tables. In addition, users can rapidly import custom lists of feature names using a modal popup that provides an Ace editor where they can directly type or paste feature names, and a file upload button that accepts text files containing one feature name per line. Users should remember to click the “Apply” button before closing the modal, to update the heat map with the new list of features. The “Suggest feature order” button clusters the rows, and also rearranges the elements in the selectize according to the clustering. It is also possible to choose which assay type is displayed (\"logcounts\" being the default choice, if available). Samples in the heat map can also be annotated, simply by selecting relevant column metadata. A zooming functionality is also available, restricted to the y-axis (i.e., allowing closer inspection on the individual features included). 
2.3.12 Description of iSEE functionality 2.3.12.1 Coloring plots by sample attributes 2.3.12.1.1 Column-based plots Column-based plots are: reduced dimension feature assay column data plots Where each data point represents a sample. Here, data points can be colored in different ways: The default is no color scheme (“None” in the radio button). Any column of colData(se) can be used. The plot automatically adjusts the scale to use based on whether the chosen column is continuous or categorical. The assay values of a particular feature in each sample can be used. The feature can be chosen either via a linked row table or selectize input (as described for the Feature assay plot panel). Users can also specify the assays from which values are extracted. The identity of a particular sample can be used, which will be highlighted on the plot in a user-specified color. The sample can be chosen either via a linked column table or via a selectize input. 2.3.12.1.2 Row-based plots For row-based plots (i.e., the sample assay and row data plots), each data point represents a feature. Like the column-based plots, data points can be colored by: “None”, yielding data points of fixed color. Any column of rowData(se). The identity of a particular feature, which is highlighted in the user-specified color. Assay values for a particular sample. 2.3.12.2 Controlling point aesthetics Data points can be set to different shapes according to categorical factors in colData(se) (for column-based plots) or rowData(se) (for row-based plots). This is achieved by checking the “Shape” box to reveal the shape-setting options. The size and opacity of the data points can be modified via the options available by checking the “Point” box. This may be useful for aesthetically pleasing visualizations when the number of points is very large or small. 2.3.12.3 Faceting Each point-based plot can be split into multiple facets using the options in the “Facet” checkbox. 
Users can facet by row and/or column, using categorical factors in colData(se) (for column-based plots) or rowData(se) (for row-based plots). This provides a convenient way to stratify points in a single plot by multiple factors of interest. Note that point selection can only occur within a single facet at a time; points cannot be selected across facets. 2.3.12.4 Zooming in and out Zooming in is possible by first selecting a region of interest in a plot using the brush (drag and select); double-clicking on the brushed area then zooms into the selected area. To zoom out to the original plot, simply double-click at any location in the plot. 2.4 Let’s practice! 2.4.1 Setting up the data We’ll download a SingleCellExperiment object, which is similar to SummarizedExperiment as it extends it. http://bioconductor.org/packages/SingleCellExperiment http://bioconductor.org/packages/spatialLIBD https://doi.org/10.1038/s41593-020-00787-0 https://osca.bioconductor.org/ https://www.nature.com/articles/s41592-019-0654-x Figures 2 and 3 ## Lets get some data using spatialLIBD sce_layer <- spatialLIBD::fetch_data("sce_layer") #> adding rname 'https://www.dropbox.com/s/bg8xwysh2vnjwvg/Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata?dl=1' #> 2024-06-10 23:27:14.700913 loading file /github/home/.cache/R/BiocFileCache/3993f119bd3_Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata%3Fdl%3D1 sce_layer #> class: SingleCellExperiment #> dim: 22331 76 #> metadata(0): #> assays(2): counts logcounts #> rownames(22331): ENSG00000243485 ENSG00000238009 ... ENSG00000278384 ENSG00000271254 #> rowData names(10): source type ... is_top_hvg is_top_hvg_sce_layer #> colnames(76): 151507_Layer1 151507_Layer2 ... 151676_Layer6 151676_WM #> colData names(13): sample_name layer_guess ... layer_guess_reordered_short spatialLIBD #> reducedDimNames(6): PCA TSNE_perplexity5 ... 
UMAP_neighbors15 PCAsub #> mainExpName: NULL #> altExpNames(0): ## We can check how big the object is with lobstr lobstr::obj_size(sce_layer) #> 33.99 MB NOTE: if you run into this error: Error in `BiocFileCache::bfcrpath()`: ! not all 'rnames' found or unique. Backtrace: 1. spatialLIBD::fetch_data("sce_layer") 3. BiocFileCache::bfcrpath(bfc, url) check the output of curl::curl_version()$version #> [1] "7.81.0" If it’s version 8.6.0, you likely need to upgrade to version 8.8.0. For macOS users, you can do this via Homebrew with ## Install homebrew from https://brew.sh/ brew install curl then install curl from source with: Sys.setenv(PKG_CONFIG_PATH = "/opt/homebrew/opt/curl/lib/pkgconfig") install.packages("curl", type = "source") For all the gory details, check https://github.com/curl/curl/issues/13725, https://github.com/Bioconductor/BiocFileCache/issues/48, and related issues. As a workaround, you could also run this: tmp_sce_layer <- tempfile("sce_layer.RData") download.file( "https://www.dropbox.com/s/bg8xwysh2vnjwvg/Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata?dl=1", tmp_sce_layer, mode = "wb" ) load(tmp_sce_layer, verbose = TRUE) #> Loading objects: #> sce_layer sce_layer #> class: SingleCellExperiment #> dim: 22331 76 #> metadata(0): #> assays(2): counts logcounts #> rownames(22331): ENSG00000243485 ENSG00000238009 ... ENSG00000278384 ENSG00000271254 #> rowData names(10): source type ... is_top_hvg is_top_hvg_sce_layer #> colnames(76): 151507_Layer1 151507_Layer2 ... 151676_Layer6 151676_WM #> colData names(12): sample_name layer_guess ... layer_guess_reordered layer_guess_reordered_short #> reducedDimNames(6): PCA TSNE_perplexity5 ... UMAP_neighbors15 PCAsub #> mainExpName: NULL #> altExpNames(0): 2.4.2 Explore the Data Now we can deploy iSEE() to explore the data. 
## Load library library("iSEE") ## Deploy iSEE(sce_layer) p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Question 1: Which panel Type is displaying the following plot? Exercise 1: Recreate the following plot. Question 2: What is different between these 2 plots? Exercise 2: Recreate the following plot. Question 3: What is different between these 2 plots? Exercise 3: Recreate the following plot Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 4: Recreate the following plot. What would you change from the last one? Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 5: Recreate the following plot. What would you change from the last one? Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 6: Download only the last plot (Final HeatMap) Exercise 7: Extract the R code only for the last plot (Final HeatMap) 2.5 Introduction to Advanced iSEE Features Adapted from the GitHub Issue: https://github.com/iSEE/iSEE/issues/650 Beyond its basic functionalities, iSEE offers advanced features that allow users to perform complex data manipulations interactively. This includes the ability to subset and filter cells based on gene expression criteria. To begin with, we will load the necessary libraries and dataset. In this case we will be using ReprocessedAllenData from the scRNAseq package, a dataset of 379 mouse brain cells from Tasic et al. (2016). After loading the dataset, we normalize the counts and perform a PCA (Principal Component Analysis) to prepare the data for visualization. 
library("scRNAseq") library("scater") library("iSEE") # Load the dataset sce <- ReprocessedAllenData(assays = "tophat_counts") # Normalize counts and perform PCA sce <- logNormCounts(sce, exprs_values = "tophat_counts") sce <- runPCA(sce, ncomponents = 4) 2.5.1 Selecting Cells Based on a Single Gene Expression To select cells based on the expression of a single gene using iSEE, we need to create an initial list of panels that will be displayed when we launch iSEE. The first panel in our list is a “FeatureAssayPlot”, which will show the expression levels of the gene “Serpine2”. By visualizing this plot, we can interactively select cells that express “Serpine2”. To complement this, we add a “ReducedDimensionPlot” to our panel list. This plot will visualize the PCA and highlight the cells that we selected based on “Serpine2” expression. The linkage between these two panels allows us to see how the selected cells are distributed in the reduced dimensional space (PCA). ## Initial settings for a single gene expression initial_single <- list( FeatureAssayPlot(Assay = "logcounts", YAxisFeatureName = "Serpine2"), ReducedDimensionPlot(Type = "PCA", ColorBy = "Column selection", ColumnSelectionSource = "FeatureAssayPlot1") ) ## Launch iSEE with the initial settings if (interactive()) { iSEE(sce, initial = initial_single) } 2.5.2 Using a Single Plot for Two Gene Co-Expression To select cells based on the expression of two genes, we can use a single “FeatureAssayPlot” panel. In this setup, one gene is plotted on the x-axis and the other gene on the y-axis. This method allows us to directly visualize and select cells that express both genes simultaneously. By adding a “ReducedDimensionPlot” to our initial panel list, we can again see how these selected cells are distributed in the PCA plot. This approach is simpler when dealing with only two genes and provides an intuitive way to explore co-expression patterns. 
## Initial settings for 2 genes expression on the same "FeatureAssayPlot" initial_combined <- list( FeatureAssayPlot(Assay = "logcounts", XAxis = "Feature name", XAxisFeatureName = "Serpine2", YAxisFeatureName = "Bcl6"), ReducedDimensionPlot(Type = "PCA", ColorBy = "Column selection", ColumnSelectionSource = "FeatureAssayPlot1") ) ## Launch iSEE with the initial settings if (interactive()) { iSEE(sce, initial = initial_combined) } 2.5.3 Selecting Cells Based on the Co-Expression of Two or more Genes In situations where we want to select cells based on the expression of two or more genes, we need to chain multiple “FeatureAssayPlot” panels together. For instance, if we are interested in cells that express both “Serpine2” and “Bcl6”, we start by creating a “FeatureAssayPlot” for “Serpine2”. Then, we add another “FeatureAssayPlot” for “Bcl6”, but this time we specify that the selection source for this plot is the “FeatureAssayPlot” for “Serpine2”. This setup ensures that only cells that were selected in the first plot (based on “Serpine2”) are displayed in the second plot (for “Bcl6”). Finally, we include a ReducedDimensionPlot to visualize the PCA, highlighting the cells that meet both criteria. This chained selection process allows for more refined filtering based on multiple gene expressions. 
## Initial settings chainning multiple "FeatureAssayPlot" initial_double <- list( FeatureAssayPlot(Assay = "logcounts", YAxisFeatureName = "Serpine2"), FeatureAssayPlot(Assay = "logcounts", YAxisFeatureName = "Bcl6", ColumnSelectionSource = "FeatureAssayPlot1", ColumnSelectionRestrict = TRUE), ReducedDimensionPlot(Type = "PCA", ColorBy = "Column selection", ColumnSelectionSource = "FeatureAssayPlot2") ) ## Launch iSEE with the initial settings if (interactive()) { iSEE(sce, initial = initial_double) } 2.6 References https://www.bioconductor.org/packages/release/bioc/html/iSEE.html https://github.com/iSEE/iSEE https://shiny.posit.co/r/gallery/life-sciences/isee/ https://bioconductor.org/packages/release/bioc/vignettes/iSEE/inst/doc/basic.html https://github.com/iSEE/iSEE/issues/650 2.7 Community iSEE authors: Kévin Rue-Albrecht https://twitter.com/KevinRUE67 Federico Marini https://twitter.com/FedeBioinfo Charlotte Soneson https://bsky.app/profile/csoneson.bsky.social Aaron Lun https://twitter.com/realAaronLun "],["recount3-introduction.html", "3 recount3 introduction 3.1 recount projects 3.2 Using recount3 3.3 Exercise 3.4 Community", " 3 recount3 introduction Instructor: Leo Don’t let useful data go to waste by Franziska Denk https://doi.org/10.1038/543007a 3.1 recount projects ReCount: data from 20 studies http://bowtie-bio.sourceforge.net/recount/index.shtml Paper from 2011 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-449 recount: over 70k human bulk RNA-seq samples uniformly processed https://jhubiostatistics.shinyapps.io/recount/ pkgdown documentation website: http://leekgroup.github.io/recount/ Bioconductor documentation website: http://bioconductor.org/packages/recount Main paper (2017) http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html Paper that explains why the counts are different from the usual ones https://f1000research.com/articles/6-1558/v1 Example analyses we did and provided as a companion website for the 2017 
paper http://leekgroup.github.io/recount-analyses/ recount3: over 700k bulk RNA-seq samples from human and mouse http://rna.recount.bio/ pkgdown documentation website: http://research.libd.org/recount3/ Bioconductor documentation website: http://bioconductor.org/packages/recount3 Pre-print: May 2021 https://doi.org/10.1101/2021.05.21.445138 Paper: November 2021 https://doi.org/10.1186/s13059-021-02533-6 These projects help such that anyone, particularly those without access to a high performance computing (HPC) system (aka a compute cluster), can access these datasets. It’s like democratizing access to the gene expression data ^^. 3.2 Using recount3 Check the original documentation here and here. Let’s first load recount3 which will load all the required dependencies including SummarizedExperiment. ## Load recount3 R package library("recount3") Next we need to identify a study of interest as well as choose whether we want the data at the gene, exon, or some other feature level. Once we have identified our study of interest, we can download the files and build a SummarizedExperiment object using recount3::create_rse() as we’ll show next. create_rse() has arguments through which we can control what annotation we want to use (they are organism-dependent). ## Lets download all the available projects human_projects <- available_projects() #> 2024-06-10 23:27:31.744946 caching file sra.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/sra.recount_project.MD.gz' #> 2024-06-10 23:27:33.913812 caching file gtex.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/gtex/metadata/gtex.recount_project.MD.gz' #> 2024-06-10 23:27:35.877246 caching file tcga.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/tcga/metadata/tcga.recount_project.MD.gz' ## Find your project of interest. 
Here we'll use ## SRP009615 as an example proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Build a RangedSummarizedExperiment (RSE) object ## with the information at the gene level rse_gene_SRP009615 <- create_rse(proj_info) #> 2024-06-10 23:27:39.53866 downloading and reading the metadata. #> 2024-06-10 23:27:40.23701 caching file sra.sra.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.sra.SRP009615.MD.gz' #> 2024-06-10 23:27:42.632536 caching file sra.recount_project.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_project.SRP009615.MD.gz' #> 2024-06-10 23:27:44.828246 caching file sra.recount_qc.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_qc.SRP009615.MD.gz' #> 2024-06-10 23:27:46.32683 caching file sra.recount_seq_qc.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_seq_qc.SRP009615.MD.gz' #> 2024-06-10 23:27:47.879167 caching file sra.recount_pred.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_pred.SRP009615.MD.gz' #> 2024-06-10 23:27:49.07915 downloading and reading the feature information. #> 2024-06-10 23:27:49.605369 caching file human.gene_sums.G026.gtf.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/annotations/gene_sums/human.gene_sums.G026.gtf.gz' #> 2024-06-10 23:27:51.119758 downloading and reading the counts: 12 samples across 63856 features. #> 2024-06-10 23:27:51.705248 caching file sra.gene_sums.SRP009615.G026.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/gene_sums/15/SRP009615/sra.gene_sums.SRP009615.G026.gz' #> 2024-06-10 23:27:53.086475 constructing the RangedSummarizedExperiment (rse) object. 
## Explore the resulting object rse_gene_SRP009615 #> class: RangedSummarizedExperiment #> dim: 63856 12 #> metadata(8): time_created recount3_version ... annotation recount3_url #> assays(1): raw_counts #> rownames(63856): ENSG00000278704.1 ENSG00000277400.1 ... ENSG00000182484.15_PAR_Y ENSG00000227159.8_PAR_Y #> rowData names(10): source type ... havana_gene tag #> colnames(12): SRR387777 SRR387778 ... SRR389077 SRR389078 #> colData names(175): rail_id external_id ... recount_pred.curated.cell_line BigWigURL ## How large is it? lobstr::obj_size(rse_gene_SRP009615) #> 24.81 MB We can also interactively choose our study of interest using the following code or through the recount3 study explorer. ## Explore available human projects interactively proj_info_interactive <- interactiveDisplayBase::display(human_projects) ## Choose only 1 row in the table, then click on "send". ## Lets double check that you indeed selected only 1 row in the table stopifnot(nrow(proj_info_interactive) == 1) ## Now we can build the RSE object rse_gene_interactive <- create_rse(proj_info_interactive) Now that we have the data, we can use recount3::transform_counts() or recount3::compute_read_counts() to convert the raw counts into a format expected by downstream tools. For more details, check the recountWorkflow paper. ## We'll compute read counts, which is what most downstream software ## uses. ## For other types of transformations such as RPKM and TPM, use ## transform_counts(). assay(rse_gene_SRP009615, "counts") <- compute_read_counts(rse_gene_SRP009615) ## Lets make it easier to use the information available for this study ## that was provided by the original authors of the study. 
rse_gene_SRP009615 <- expand_sra_attributes(rse_gene_SRP009615) colData(rse_gene_SRP009615)[ , grepl("^sra_attribute", colnames(colData(rse_gene_SRP009615))) ] #> DataFrame with 12 rows and 4 columns #> sra_attribute.cells sra_attribute.shRNA_expression sra_attribute.source_name sra_attribute.treatment #> <character> <character> <character> <character> #> SRR387777 K562 no SL2933 Puromycin #> SRR387778 K562 yes, targeting SRF SL2934 Puromycin, doxycycline #> SRR387779 K562 no SL5265 Puromycin #> SRR387780 K562 yes targeting SRF SL3141 Puromycin, doxycycline #> SRR389079 K562 no shRNA expression SL6485 Puromycin #> ... ... ... ... ... #> SRR389082 K562 expressing shRNA tar.. SL2592 Puromycin, doxycycline #> SRR389083 K562 no shRNA expression SL4337 Puromycin #> SRR389084 K562 expressing shRNA tar.. SL4326 Puromycin, doxycycline #> SRR389077 K562 no shRNA expression SL1584 Puromycin #> SRR389078 K562 expressing shRNA tar.. SL1583 Puromycin, doxycycline We are now ready to use other bulk RNA-seq data analysis software tools. 3.3 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: Use iSEE to reproduce the following image Hints: Use dynamic feature selection Use information from columns (samples) for the X axis Use information from columns (samples) for the colors (optional) Create your free account at https://www.shinyapps.io/ and share your iSEE app with the world. Regrettably iSEE::iSEE() will need more than the default free 1 GB RAM option available from https://www.shinyapps.io/. Real examples used on a paper: https://github.com/LieberInstitute/10xPilot_snRNAseq-human#explore-the-data-interactively. Example from another course: https://libd.shinyapps.io/SRP009615/. It was created with https://github.com/lcolladotor/rnaseq_2023_notas_en_vivo/blob/main/app.R. 
3.4 Community recount2 and 3 authors on Twitter: https://twitter.com/chrisnwilks https://twitter.com/BenLangmead https://twitter.com/KasperDHansen https://bsky.app/profile/nav.bsky.social https://twitter.com/Shannon_E_Ellis https://twitter.com/jtleek More about the different types of counts: If I'm using recount2 data for a differential analysis in DEseq2, should I be using the original counts, or the scaled counts?@mikelove @lcolladotor #rstats #Bioconductor — Dr. Robert M Flight, PhD (@rmflight) January 29, 2021 Tweets from the community From a student in the LCG-UNAM 2021 course: @lcolladotor Earlier I was looking for some data to analyze in recount, they have so much, I seriously can't decide what to use! https://t.co/fIJwXq46TzThanks for such an useful package!@chrisnwilks @BenLangmead @KasperDHansen @AbhiNellore @Shannon_E_Ellis @jtleek — Axel Zagal Norman (@NormanZagal) February 25, 2021 Exploring the possibility of using recount3 data for an analysis (January 2022): I have found a novel exon expressed in a cancer sample. I would like to search TCGA/SRA to identify other samples with the same/similar exon. It will be rare. Can I use Recount3, megadepth for this? @jtleek @lcolladotor @BenLangmead — Alicia Oshlack (@AliciaOshlack) January 5, 2022 Others discussing meta analyses publicly on Twitter: Thinking on this a bit it is strange how few people are doing “medium-sized” meta analyses of transcriptiomics. One on end you have @BenLangmead @lcolladotor reprocessing (with a touch of analysis) most of SRA. And you see papers pulling an dataset or two to corroborate. — David McGaughey (@David_McGaughey) February 1, 2022 That might be a gin&tonic in my hand, but it still holds true that #recount3 is a wonderful resource and super useful in our annotation efforts! Great to meet you @lcolladotor!! 
https://t.co/cSCZAajhrY — GencodeGenes (@GencodeGenes) May 11, 2024 "],["differential-gene-expression-analysis-overview.html", "4 Differential Gene Expression analysis overview 4.1 Preliminary steps 4.2 Differential Gene Expression 4.3 Downstream analyses References", " 4 Differential Gene Expression analysis overview Instructor: Daianna González Padilla Differential Gene Expression (DGE) analyses are common statistical analyses of gene expression data that aim to discover genes significantly altered in their expression levels between experimental groups, which can be given by a condition, treatment, experimental procedure/exposure, diagnostic, time points, by biological origins (e.g. differences in sex, tissue, age, species), and even by different technical methodologies. These genes are known as Differentially Expressed Genes (DEGs) and can be either up- or down-regulated if their expression is greater or less in one group with respect to the other(s), respectively. Diverse methods exist to perform DGE and multiple downstream analyses can be applied on DEGs, but a series of non-skippable preliminary steps exists which are necessary to correctly perform previous to any statistical testing. Below a classic workflow for DGE is depicted. It takes as input the gene expression matrix with raw read counts for genes (as rows) across all samples (as columns). Among the preliminary steps, there is an initial data processing step encompassing count normalization and filtering of lowly-expressed genes. Secondly, Exploratory Data Analysis (EDA) involves assessment of Quality Control (QC) metrics of the samples and filtering of poor-quality ones, as well as an examination of the gene expression profiles between sample groups, potential detection of additional atypical samples to remove, and the exploration of the correlations between sample-level variables and their contributions in the expression variance of each gene to guide the covariate selection for DGE models. 
Figure 1: Summary of the analyses for differential expression. 1. RNA-seq data processing: raw counts are normalized and log-scaled (lognorm counts) and the lowly-expressed genes are filtered out. 2. Exploratory Data Analysis: quality metrics of the samples are compared across groups, the poor-quality samples are filtered and both sample-level and gene-level effects of sample variables are explored to identify those that are main drivers of gene expression variation to include in the models for DGE. 3. Differential Expression Analysis: under the limma-voom pipeline the expression of each gene is linearly modeled by the selected variables in the previous step; after fitting the model gene-wise log2-fold changes (log2FC) and p-values are obtained for the variable of interest and other statistics of differential expression are also computed and compared. Here DEGs are determined based on the significance threshold (controlling for the False Discovery Rate or FDR). 4. Functional Enrichment Analysis: an overrepresentation analysis (ORA) is performed to find statistically significant associations between our groups of DEGs and gene sets annotated in GO terms and KEGG pathways; here we identify biological processes, cellular functions and components, and pathways potentially affected or involved in the experimental condition under study. 5. DE visualization: heatmaps are created to visually contrast gene expression levels of DEGs in the experimental groups. Abbreviations: CPM: counts per million; QC: quality control; PC: principal component; DEG(s): differentially expressed gene(s); Ctrl: control; Expt: experimental; GO: Gene Ontology; KEGG: Kyoto Encyclopedia of Genes and Genomes. 4.1 Preliminary steps Evident computational steps right after sequencing involve raw sequencing reads Quality Control (QC) analysis and read alignment to a reference genome for the subsequent gene expression quantification, generating the input for DGE. 
Comprehensive pipelines have been developed for these purposes, such as the RNA-seq processing pipeline SPEAQeasy (Eagles, N.J. et al. 2021) that provides a flexible, user-friendly, and reproducible pipeline to perform all such analyses through the implementation of a single workflow, liberating from the need of performing each step individually. p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 15px; font-family: sans-serif; } 👉🏼 More details of this pipeline are provided in the original manuscript, the documentation website, and in other created resources. 4.1.1 RNA-seq data processing Once the gene expression matrix has been generated we can proceed to process the read counts. Raw counts are typically zero-enriched and not normally-distributed, opposite to what is required for the application of several statistical methods. Furthermore, raw gene expression values may reflect protocol-specific biases and biological factors other than the one of interest. Therefore raw gene expression counts must be normalized and lowly-expressed genes filtered out. 4.1.1.1 Data normalization Differences between samples such as library sizes (sequencing depths), and RNA composition, as well as different gene lengths and GC contents make raw gene expression data not comparable between samples. Several normalization strategies can be applied to surpass such differences; commonly counts-per-million (cpm) are used. In addition to count normalization, a log-transformation is required to make cpm follow an approximately normal distribution. 4.1.1.2 Gene filtering Often the expression profiling platform (microarray or RNA-seq) includes genes that do not appear to be expressed to a worthwhile degree in any or most of the samples. This might occur, for instance, in cases where genes are not expressed in any of the cell types being experimentally profiled [1]. 
We want to remove those genes prior to posterior analyses, which is justified on both biological and statistical grounds [2]: Biologically, it is considered that a gene must have a minimal expression level to be translated into a protein or to be of biological importance. Statistically, lowly-expressed genes are unlikely to be detected as DE because low counts don’t provide the required statistical evidence to assess differential expression. Different approaches exist for this step, ranging from the simple definition of a gene expression cutoff to those taking into account not only global gene expression but also the number of samples in which they have a minimum expression level. 👉🏼 More details about normalization and filtering can be consulted in the course material of 2023 (Statistical Analysis of Genome Scale Data 2023: Data preparation). Figure 2: RNA-seq data processing steps. 1. Count log-normalization: distribution of read counts before and after normalization and log-transformation into \\(log_2(cpm+0.5)\\) using calcNormFactors() and cpm() of edgeR. 2. Gene filtering: distribution of \\(log_2(cpm+0.5)\\) before and after filtering lowly-expressed genes; note the elimination of counts that were originally zeros. 4.1.2 Exploratory Data Analysis The Exploratory Data Analysis (EDA) is a primordial step in which, as the name refers, we explore relevant aspects of the RNA-seq data. In this process we basically create tons of plots, charts and graphs to visualize the data, assess their quality and inspect their variability. This clearly exposes low-quality samples and relationships and contributions in gene expression variance of sample-level variables. This allows to draw valuable information from our data that could impact posterior analyses, including DGE. 
Thus EDA guides filtering steps, the execution of additional analyses, the selection of covariates for DGE models and of statistical tests based on data features, and could also aid in more accurate interpretations of the results. Although here we describe EDA as being comprised by QCA, dimensionality reduction to explore sample-level effects, and variance partition analysis to explore gene-level effects, EDA is not a well defined process that can be followed by concrete instructions or steps. The analyses you run and what you plot depends on the particular questions you’re trying to answer, what you would like to know about your data and of course, it completely depends on the specific characteristics of your own dataset. 4.1.2.1 Quality Control Analysis (QCA) First, the quality metrics of the samples regarding read and RNA contents, and read mapping rates have to be compared to (Figure 3: step 1): Identify punctual samples or groups of samples of poor quality that may have arisen by technical causes during experimental steps. Evaluate if samples from the groups of interest for DGE (diagnostic, treatment, etc.) differ in their quality metrics as these can represent confounding factors for differential expression. Detect high biological variability to subsequently support data partition to perform subanalyses from the data. Further, we are also interested in investigating trends and relationships between sample variables to unveil underlying technical and biological aspects of the observed data (Figure 3: step 2). After having identified poor-quality samples, we have to remove them to not include the unreliable expression data they provide in downstream analyses. Cutoffs can be defined for specific QC metrics to decide which samples to keep; this however, is not strongly recommended as no consolidated references exist to define such cutoffs and therefore rather represent arbitrary values. 
Other approaches include identifying outlier QC metrics (Figure 3: step 3), but again, caution must be taken as outlier definition is also arbitrary and we could be discarding good-quality samples. Figure 3: Quality Control Analysis steps. 1. Evaluate QC metrics for groups of samples: sample QC metrics such as the fraction of reads that mapped to the mitochondrial chromosome (mitoRate) and to the reference genome (overallMapRate) are compared between sample groups given by the variable of interest (Group in this example), technical variables (e.g. plate for sample library preparation), and biological variables (e.g. Age). 2. Examine relationships between sample variables: pairs of QC metrics are compared; here mitoRate and the fraction of reads assigned to rRNA genes (rRNA_rate), as well as the library size (sum) and the number of expressed genes (detected) are plotted to explore the relationships they present with each other and with other sample metadata variables. (Group and Age). 3. QC-based sample filtering: outlier QC metrics (red) are detected based on +/- 3 median-absolute-deviations (MADs) away (dotted lines) from the median (solid line). 👉🏼 See more details about QCA in Statistical Analysis of Genome Scale Data 2023: Quality Control Analysis. 4.1.2.2 Exploration of sample-level effects Sample gene expression profiles can be analyzed and compared after dimensionality reduction procedures such as Principal Component Analysis (PCA) and Multidimensional-Scaling (MDS). These analyses are useful to potentially detect samples with outlier transcriptomic profiles to further remove and to identify sample variables driving gene expression variations (Figure 4). Figure 4: Exploration of sample-level effects through PCA 1. 
Detection of atypical samples (manual PCA-based sample filtering): PCx vs PCy plots can expose outlier samples that appear segregated from the rest (purple-squared sample) or samples of a particular group (Sex: F or M) closer to samples from the other group (blue-squared sample). These should be further examined to evaluate if they can be kept or must be discarded. In this case, after removing them, PC2 that explains a higher % of variance in gene expression, separates samples by sex. 2. Identification of drivers of sample gene expression variation: reducing the dimensionality of our data enables us to recognize sample variables explaining differences in the gene expression of the samples (Age), ascertain technical variables and batch effects are not impacting on the transcriptome (plate), and inquire to what extent our variable of interest is contributing to changes in gene expression (Group). 4.1.2.3 Model building: covariate selection for limma-voom DGE methods fitting linear models to gene expression data to assess if a covariate impacts significantly on the expression of a gene, require the selection of sample-level variables to model transcriptomic data. If very few variables are present, normally they are all included in the model but that’s not often the case with RNA-seq and it doesn’t represent a well founded strategy. Usually, multiple technical and biological variables are implicated in the experiments and sample QC metrics can affect the gene expression levels, even after count normalization, whereas other variables are redundant and/or minimally informative. Therefore, we’d like to identify an optimal set of variables to adjust gene expression for, in addition to the covariate of interest. We have already introduced one first approximation to that with PCA as this analysis allows us to identify variables explaining high percentages of gene expression variance between samples. 
In Chapter 7 we will review how correlation and variance partition analyses at the gene level can help us determine a suitable set of highly explanatory variables. 4.2 Differential Gene Expression Different mathematical and statistical approaches exist to compare gene expression between two or more conditions. In Chapter 5 we’ll briefly introduce methods based on the negative binomial distribution and address how to perform DGE under the empirical Bayes limma-voom framework, distinguishing how it operates, its main specifications, inputs, and outputs. 4.3 Downstream analyses After finding DEGs, volcano plots and heat maps are commonly used to graphically represent them, plotting relevant information about them and their expression levels, respectively. In Chapter 5 we’ll also check how to create and interpret these plots. References Smyth, G. K., Ritchie, M., Thorne, N., Wettenhall, J., Shi, W., & Hu, Y. (2002). limma: linear models for microarray and RNA-Seq data user’s guide. Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia. Chen, Y., Lun, A. T., & Smyth, G. K. (2016). From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline. F1000Research, 5. "],["differential-gene-expression-analysis-with-limma-voom.html", "5 Differential Gene Expression analysis with limma-voom 5.1 NB-based DGE methods? 5.2 limma-voom pipeline 5.3 DE visualization References", " 5 Differential Gene Expression analysis with limma-voom Instructor: Daianna González Padilla In this chapter you’ll learn how DGE analysis is performed under the empirical Bayes framework of the popular limma-voom pipeline, highlighting key assumptions and concepts, and main differences with other methodologies. 5.1 NB-based DGE methods? An initial central point of discussion around DGE method development is how to model the distribution of the reads. 
Many methods model the read counts (\\(y_{k,ij}\\), non-negative integers) of a gene \\(i\\) in the \\(j\\) samples of condition \\(k\\) through the Poisson or the Negative Binomial (NB) distribution. Of these, NB is often preferred as it allows the mean (\\(\\mu\\)) and the variance (\\(\\sigma\\)) of the reads to be different, compared to the Poisson distribution where \\(\\mu\\)=\\(\\sigma\\). This is of particular importance as controlling the variance allows to account for variability in the gene expression levels across biological samples [1]. Figure 1: NB-distributed read counts. Modeling of read counts for gene \\(i\\) in the samples of the first and second conditions based on the NB model. Modified from Li, W. V., & Li, J. J. (2018). Estimating the NB distribution parameters is necessary to assess DE of each gene \\(i\\) between any two conditions \\(k=1,2\\) (Figure 1). Bayesian models are used defining prior distributions and relationships of such parameters. Briefly, after 1) estimating gene-wise NB parameters, 2) the mean-variance relationship across all genes can be used to shrink the gene variance estimations borrowing information from all genes or incorporating prior knowledge, something advantageous when sample sizes are small. 3) A statistical test is used to assess for each gene \\(i\\) if its true expression in the first and second condition (\\(\\theta_{1i}\\) and \\(\\theta_{2i}\\)) is the same (null hypothesis) or differs (alternative hypothesis): \\(H_0: \\theta_{1i}=\\theta_{2i}\\) \\(H_1: \\theta_{1i}≠\\theta_{2i}\\), where the \\(\\theta_{i}\\)’s are parameters included in the mean of the NB distributions (\\(\\mu\\)). 4) The test statistic is computed for each gene and 5) its associated p-value is calculated based on the null distribution. 6) Finally, p-values are corrected for multiple-testing and DEGs are determined based on an adjusted p-values cutoff [1]. Examples of popular methods based on the NB distribution are edgeR and DESeq2. 
Nevertheless, one limitation NB-based methods face is that they set dispersion of the data as a known and global parameter, ignoring observation-specific variation and importantly, there’s a reduced number of statistical methods for count distributions compared to the normal distribution [1,2]. Here, we’ll focus on limma that does not rely on a certain distribution but rather works on \\(log_2(cpm)\\) (CPM: counts per million) and fits linear models for DGE enabling the incorporation of additional predictors to model gene expression, a feature specially valuable for complex experimental settings. 5.2 limma-voom pipeline limma is a package for the analysis of gene expression data arising from microarray or RNA-seq technologies. It has features that make the analyses stable even for experiments with small number of arrays or samples —this is achieved by borrowing information across genes. It is specially designed for analyzing complex experiments with a variety of experimental conditions and predictors [3]. Usually, limma DGE analysis is carried out in five main steps, the last four of them completed by limma R functions, as described below. We’ll use bulk RNA-seq data from the smokingMouse package to exemplify these steps. ## Load the container package for RSE library("SummarizedExperiment") ## Connect to ExperimentHub library("ExperimentHub") eh <- ExperimentHub::ExperimentHub() ## Load package datasets myfiles <- query(eh, "smokingMouse") ## Download the mouse gene data rse_gene <- myfiles[["EH8313"]] ## Samples from the nicotine experiment and from pups only rse_gene_nic <- rse_gene[, which(rse_gene$Expt == "Nicotine" & rse_gene$Age == "Pup")] ## Retain only expressed genes (passed the filtering step) rse_gene_filt <- rse_gene_nic[ rowData(rse_gene_nic)$retained_after_feature_filtering, ] Let’s explore a little the data. 
## Data dimensions: number of genes and samples dim(rse_gene_filt) #> [1] 19974 42 ## Raw counts for first 3 genes in the first 5 samples assays(rse_gene_filt)$counts[1:3, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 2652 2107 1978 2691 1833 #> ENSMUSG00000102331.1 15 15 9 15 13 #> ENSMUSG00000025900.13 10 7 28 11 8 ## Log-normalized counts for first 3 genes in the first 5 samples assays(rse_gene_filt)$logcounts[1:3, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 5.639967 5.953457 5.4923034 5.903313 5.800879 #> ENSMUSG00000102331.1 -1.747878 -1.130265 -2.1809593 -1.517393 -1.282590 #> ENSMUSG00000025900.13 -2.295096 -2.173926 -0.6153596 -1.941338 -1.948814 ## Data for the first 2 samples head(colData(rse_gene_filt), 2) #> DataFrame with 2 rows and 71 columns #> SAMPLE_ID FQCbasicStats perBaseQual perTileQual perSeqQual perBaseContent GCcontent Ncontent #> SeqLengthDist SeqDuplication OverrepSeqs AdapterContent KmerContent SeqLength_R1 percentGC_R1 phred15-19_R1 #> phred65-69_R1 phred115-119_R1 phred150-151_R1 phredGT30_R1 phredGT35_R1 Adapter65-69_R1 Adapter100-104_R1 #> Adapter140_R1 SeqLength_R2 percentGC_R2 phred15-19_R2 phred65-69_R2 phred115-119_R2 phred150-151_R2 phredGT30_R2 #> phredGT35_R2 Adapter65-69_R2 Adapter100-104_R2 Adapter140_R2 ERCCsumLogErr bamFile trimmed numReads #> numMapped numUnmapped overallMapRate concordMapRate totalMapped mitoMapped mitoRate totalAssignedGene rRNA_rate #> Tissue Age Sex Expt Group Pregnant plate location concentration #> medium date Pregnancy flowcell sum detected subsets_Mito_sum subsets_Mito_detected #> subsets_Mito_percent subsets_Ribo_sum subsets_Ribo_detected subsets_Ribo_percent retained_after_QC_sample_filtering #> retained_after_manual_sample_filtering #> [ reached getOption("max.print") -- omitted 3 rows ] 📝 Exercise 1: in order for you to perform a DGE analysis, locate your own RNA-seq datasets if you have any, or download expression data from a study of your interest and build a RSE object 
using recount3 (see Chapter 3: recount3 introduction). A third option you have is to download gene expression data from the smokingMouse package used here. A fourth option is to download data from GEO as Sean Davis will explain next. We’ll have more time tomorrow for doing this exercise with data of your choosing. 5.2.1 model.matrix() limma fits a linear model to the expression data of each gene (response variable), modeling the systematic part of the data by sample-level covariates (predictors). p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.info { background-color: #FFFFF0; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.conclusion { background-color: #EEE9E9; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.question{ background-color: #E3E3E3; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } p.comment { background-color: #F0F0F0; padding: 20px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.alert { background-color: #FFE4E1; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.success { background-color: #E0EEE0; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 💡 A model is a specification of how a set of variables relate to each other. In the case of a linear model, it is a linear equation that describes how the dependent or response variable is explained by the independent variables, also called predictors. 
A regression analysis with more than one independent variable is called multiple regression. Regression with only one independent variable is called simple regression [4]. The limma model is specified with a design matrix, also known as model matrix or regressor matrix, often denoted by \\(X\\). This is a matrix of values for explanatory variables of the samples: rows correspond to samples and columns to sample variables. Say that the values the \\(i\\)th sample take in the \\(h\\) covariates are \\(X_{ih}\\)’s and their coefficients are \\(\\beta_{h}\\)’s. The predicted expression of a gene in the \\(i\\)th sample is given by \\(\\hat y_i =\\beta_0 + \\sum_{1}^h\\beta_{h}X_{ih}\\). \\[ \\hat y = X\\beta=\\displaystyle {\\begin{bmatrix} \\hat y_{1}\\\\ \\hat y _{2}\\\\ \\hat y_{3}\\\\...\\\\ \\hat y_{n-1}\\\\ \\hat y_{n}\\end{bmatrix}}={\\begin{bmatrix}1&X_{11}&X_{12}&X_{13}&\\cdots&X_{1,h-1}&X_{1h}\\\\1&X_{21}&X_{22}&X_{23}&\\cdots&X_{2,h-1}&X_{2h}\\\\1&X_{31}&X_{32}&X_{33}&\\cdots&X_{3,h-1}&X_{3h} \\\\ \\vdots & \\vdots & \\vdots & \\vdots & \\ddots & \\vdots & \\vdots \\\\1&X_{n-1,1}&X_{n-1,2}&X_{n-1,3}&\\cdots&X_{n-1,h-1}&X_{n-1,h} \\\\1&X_{n,1}&X_{n,2}&X_{n,3}&\\cdots&X_{n,h-1}&X_{n,h} \\end{bmatrix}}{\\begin{bmatrix}\\beta _{0}\\\\\\beta _{1}\\\\\\beta _{2}\\\\\\beta_{3}\\\\...\\\\\\beta_{h-1}\\\\\\beta_{h}\\end{bmatrix}} \\] where \\(n\\) is the number of samples. In the first step we create this matrix using model.matrix() that receives a formula with the variables to include in the models and the sample data. 
## Define formula formula <- ~ Group + Sex + flowcell + mitoRate + overallMapRate + totalAssignedGene + detected + ERCCsumLogErr ## Model matrix model <- model.matrix(formula, data = colData(rse_gene_filt)) head(model) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX mitoRate overallMapRate #> 1 1 0 0 0 1 0 0.03876995 0.9811 #> 2 1 1 0 0 1 0 0.03337699 0.9791 #> 3 1 0 1 0 1 0 0.03606147 0.9825 #> 4 1 1 1 1 0 0 0.03962591 0.9855 #> totalAssignedGene detected ERCCsumLogErr #> 1 0.7715862 26545 -67.33211 #> 2 0.7778447 24545 -66.38868 #> 3 0.7870034 25640 -58.89350 #> 4 0.7786461 25905 -84.91929 #> [ reached getOption("max.print") -- omitted 2 rows ] ❓ Which variables to include as covariates in the models? A straightforward strategy is to keep the model as simple as possible and after fitting the model evaluate the comparisons of interest [3]. In Chapter 7 we will discuss how correlation and variance partition analyses can help us to set up the best models. ⚠️ Very important: always check which condition group is set as the reference in you model for the coefficient/contrast of interest (column named as [Coefficient_name][Reference_Group]; corresponding reference group set to 1) as this determines if a DEG is up or downregulated in the given condition compared to the other. ## Comparison of interest: Group coef <- "GroupExperimental" 📝 Exercise 2: identify the sample data of your study and create the respective design matrix. Which is the reference group for your main variable of interest? Tomorrow we will learn how to use ExploreModelMatrix for helping us interpret coefficients. 5.2.2 voom() Compared to NB-based methods, limma works with \\(log2(cpm)\\) which are approximately normally distributed (as we have seen) and thus, opens the possibility to leverage a wide range of normal-based statistical tools not available for count distributions, including methods developed for microarray data. 
However, limma doesn’t assume nor require data to follow a normal distribution, but it does apply normal-based microarray-like statistical methods to RNA-seq read counts [2]. “… limma does not make any assumption that the data appears normal in a histogram.” - Gordon Smyth, author of limma, in the Bioconductor support website 2021. The benefit of using \\(log2(cpm)\\), however, is not immediate. One limitation for the direct application of normal-based methods to log-counts is that read counts have unequal variabilities even after a log-transformation depending on the count sizes: probability distributions for counts are naturally heteroscedastic, with log-cpm not having constant variances (larger variances for larger counts) [2]. It has been proposed that to design powerful statistical analysis for RNA-seq, it is more important to model the relationship between the mean and the variance in the data than to specify which probabilistic distribution to use for the counts [2]. And importantly, converting count data taking such a relationship into account does open up access to their analysis with normal-based methods. That’s why we use voom(). What voom() does is: First, to compute log-cpm. Log-normalized expression for gene \\(g\\) in sample \\(i\\) (\\(y_{gi}\\)) is given by \\[ y_{gi}=log_2(\\frac{r_{gi} + 0.5}{R_i + 1.0} \\times 10^6) \\] where \\(r_{gi}\\) is the raw count for the gene in the sample and \\(R_i\\) the library size of the sample. We add +0.5 to the counts to avoid log of zero and +1 to the library size to ensure that \\(\\frac{r_{gi}+0.5}{R_i+1}\\) is strictly less than 1 (if \\(r_{gi} = R_i\\)). A linear model is fitted to gene log-cpm values by ordinary least squares as: \\[ E(y_{gi})=\\mu_{gi}=X_i\\beta_g \\] where \\(E(y_{gi})\\) is the expected expression of gene \\(g\\) in sample \\(i\\), \\(X_i\\) is the vector with the sample values for the covariates and \\(\\beta_g\\) the vector of covariate coefficients for the gene. 
As a result, we have the estimated \\(\\hat\\beta_g\\), the fitted log-cpm’s \\(\\hat\\mu_{gi}=X_i\\hat\\beta_g\\) and the residual standard deviations \\(s_g\\). Then it estimates the mean-variance trend of the data by fitting a smooth curve to the \\(\\sqrt s_g\\) of the genes presented as a function of the average gene expression (in log-counts, not log-cpm). The \\(\\sqrt s_g\\)’s are used because they are symmetrically distributed. Log-counts typically show a decreasing mean-variance trend. voom() then predicts the standard deviation of each individual normalized observation \\(y_{gi}\\) (limma-trend does that at the gene level) using this trend curve: the fitted log-count of each observation is mapped to the curve and its \\(\\sqrt s_{gi}\\) value is obtained. The observation weights are \\(w_{gi}=\\frac{1}{s_{gi}^2}\\). Figure 2: voom() procedure to estimate observation-level variance weights for limma. Extracted from the original voom publication (Law, C. W. et al. 2018). Log-cpm (\\(y_{gi}\\)) and associated weights (\\(w_{gi}\\)) can then be entered into the limma framework for linear modeling. These weights are used in the linear modeling to adjust for count heteroscedasticity [2]. library("limma") ## voom(): # 1. Transform counts to log2(cpm) # ---------------------------------------------------------------------------- # . | Note we passed voom() raw counts as input, not the lognorm counts!!! | # ---------------------------------------------------------------------------- # 2. Estimate mean-variance relationship for each gene # 3. Compute observation weights for limma (next step) vGene <- voom(assay(rse_gene_filt), design = model, plot = TRUE) Let’s explore the outputs of this function. 
## Returned data names(vGene) #> [1] "E" "weights" "design" "targets" ## E: contains the computed log(cpm) dim(vGene$E) #> [1] 19974 42 vGene$E[1:5, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 5.906572 6.1425731 5.7434780 6.133741 6.061250 #> ENSMUSG00000102331.1 -1.512368 -0.9445475 -1.9587859 -1.306258 -1.024247 #> ENSMUSG00000025900.13 -2.074247 -1.9918532 -0.3738234 -1.736892 -1.691672 #> ENSMUSG00000025902.13 1.446325 1.2611275 1.3707154 1.419026 1.688471 #> ENSMUSG00000098104.1 1.572354 1.2408075 1.4727667 1.404882 1.533748 ## weights: contains the computed variance weight for each observation dim(vGene$weights) #> [1] 19974 42 vGene$weights[1:5, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> [1,] 143.326885 117.323375 139.214140 141.247546 128.818305 #> [2,] 4.255525 4.277395 2.698902 5.113520 3.377285 #> [3,] 4.009671 3.341317 5.555186 4.020098 2.546810 #> [4,] 20.584769 15.108579 15.521441 19.219652 16.893714 #> [5,] 22.473314 16.369739 18.359068 17.691839 14.325510 ## design: is the provided design matrix head(vGene$design) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX mitoRate overallMapRate #> 1 1 0 0 0 1 0 0.03876995 0.9811 #> 2 1 1 0 0 1 0 0.03337699 0.9791 #> 3 1 0 1 0 1 0 0.03606147 0.9825 #> 4 1 1 1 1 0 0 0.03962591 0.9855 #> totalAssignedGene detected ERCCsumLogErr #> 1 0.7715862 26545 -67.33211 #> 2 0.7778447 24545 -66.38868 #> 3 0.7870034 25640 -58.89350 #> 4 0.7786461 25905 -84.91929 #> [ reached getOption("max.print") -- omitted 2 rows ] ## targets: the sample library sizes used to compute log(cpm) in the first step dim(vGene$targets) #> [1] 42 1 head(vGene$targets) #> lib.size #> 1 44218086 #> 2 29831069 #> 3 36929795 #> 4 38331383 #> 5 27457620 #> 6 27113922 identical(vGene$targets$lib.size, colSums(assay(rse_gene_filt))) #> [1] TRUE ➡️ In summary, voom() estimates non-parametrically the global mean-variance trend of the count data based on the expression of the genes and uses that to predict 
the variance of each individual expression observation (each log-cpm value) based on their predicted count sizes. The predicted variances are then associated as inverse weights to each observation that when used in linear modeling eliminate the log-cpm mean-variance trend [2]. 👉🏼 Advantages: ✅ voom() estimates the mean-variance relationship in a non-parametric way. “The parametric advantages of the Poisson or NB distributions are mitigated by the fact that the observed mean-variance relationship of RNA-seq data does not perfectly match the theoretical mean-variance relationships inherent in these distributions. While the quadratic mean-variance relationship of the NB distribution captures most of the mean-variance trend, the NB dispersion still shows a non-ignorable trend with gene abundance.” [2] ✅ Since voom() is a method to adapt count data to normal models, these give access to tractable empirical Bayes distribution theory. ✅ The use of normal distribution approaches and variance modeling is supported by generalized linear model theory. 📝 Exercise 3: compute the \\(log2(cpm)\\) and the residual variance weights for each observation in your data using voom(). 5.2.3 lmFit() This limma function fits a multiple linear model to the expression of each gene by weighted or generalized least squares to estimate the coefficients of the sample covariates which correspond to the logFC’s comparing gene expression between sample groups. Ordinary least squares (OLS) This is used to estimate the coefficients of a linear regression by minimizing the residual sum of squares [5]. Figure 3: Graphical representation of the OLS method for simple regression analysis. Source: Gulve, A. (2020). Ordinary Least Square (OLS) Method for Linear Regression. For simplicity, let’s work with one gene and say we have \\(n\\) samples. 
The fitted expression of the gene in the \\(j\\)th sample is \\(\\hat y_j =\\beta_{0} + \\sum_{1}^h\\beta_{h}X_{jh}\\) , where \\(\\beta_h\\) is the coefficient for the \\(h\\)th covariate and \\(X_{jh}\\) the value the \\(j\\)th sample takes for the \\(h\\)th covariate. It can also be written as \\(\\hat y_j =\\sum_{0}^h\\beta_{h}X_{jh}\\) if \\(X_{j0}=1\\). So we have an overdetermined system of \\(n\\) linear equations and \\(h\\) unknown parameters with \\(n>h\\): \\(\\hat y_j =\\sum_{0}^h\\beta_{h}X_{jh}\\) with \\(j=(1,2, ..., n)\\). Such a system usually has no exact solution, so we need to estimate the coefficients that best fit the data in a linear regression. The problem is reduced to solving a quadratic minimization problem: \\(\\hat \\beta = \\arg \\min_{\\beta} S(\\beta)\\) where \\(S(\\beta)=\\sum_j(y_j -\\hat y_j)^2=RSS\\) (residual sum of squares). 💡 We can think of these \\(\\beta\\)’s as differences in the fitted (expected) expression of a gene. Say we have two binary categorical variables in the model (\\(X_1\\) and \\(X_2\\)), then the expected gene expression in a sample is \\(E(y|X_1, X_2) =\\hat y =\\beta_{0} + \\beta_1X_1+\\beta_2X_2\\), where \\(X_1\\) and \\(X_2\\) are equal to 1 or 0. Then we have the following 4 combinations: \\(E(y|X_1=1, X_2=1) = \\mu_{12}=\\beta_{0} + \\beta_1+\\beta_2\\) \\(E(y|X_1=1, X_2=0) =\\mu_{1}=\\beta_{0} + \\beta_1\\) \\(E(y|X_1=0, X_2=1) =\\mu_{2}=\\beta_{0} + \\beta_2\\) \\(E(y|X_1=0, X_2=0) =\\mu_{0}=\\beta_{0}\\) So \\(\\beta_1=\\) \\(\\mu_1-\\mu_0\\) \\(=\\) \\(\\mu_{12}-\\mu_2\\) and \\(\\beta_2=\\) \\(\\mu_2\\)\\(-\\) \\(\\mu_0\\). Say our variable of interest is \\(\\beta_1\\). Then what we are testing is if the expected expression of a gene is different when \\(X_1=1\\) (in the first sample group) and \\(X_1=0\\) (in the second sample group), fixing \\(X_2\\) at either 1 or 0. 
Generalized least squares (GLS) Is a generalization of OLS that allows for heteroskedasticity and correlation between the residuals [6]. Weighted least squares (WLS) In this case the function to be minimized becomes the weighted sum of the squared residuals: squared residuals are weighted by the reciprocal of their variance so that more noisy observations have less weight. That’s what we used voom() for. lmFit() returns a fitted model object with the estimated coefficients, standard errors (\\(SE=sd/\\sqrt n\\)) and residual standard errors/deviations (\\(RSE=s_g=\\sqrt {RSS/ n-2}\\)) for each gene. Depending on the arguments and correlations in the data, this function calls one of the following functions to fit a linear model for each gene [7]: mrlm: for a robust regression if method=\"robust”. gls.series: GLS estimator if method=\"ls” and a correlation structure has been specified. lm.series: OLS method if method=\"ls” and there is no correlation structure. For the weights argument of lmFit(), the precision weights for the observations previously computed are extracted from the voom() output. ## lmFit(): # 1. 
Fit linear model for each gene to estimate logFCs fitGene <- lmFit(vGene) ## Corroborate "ls" method was applied fitGene$method #> [1] "ls" ## Explore outputs: estimated coefficients (logFCs) head(fitGene$coefficients) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX #> ENSMUSG00000051951.5 -35.637900 -0.05125195 0.05690091 -0.47469588 -0.38545404 -0.66545820 #> ENSMUSG00000102331.1 37.943310 0.72450620 0.19887963 -0.20803712 -0.40926270 -0.10900553 #> ENSMUSG00000025900.13 -43.586603 0.17256694 0.28895786 -0.04476551 0.15257245 -0.06949759 #> ENSMUSG00000025902.13 5.657837 -0.05025788 -0.04808144 -0.18732331 -0.26041436 0.07364071 #> mitoRate overallMapRate totalAssignedGene detected ERCCsumLogErr #> ENSMUSG00000051951.5 -11.487040 37.443647 4.753830 6.442499e-05 -0.0043851842 #> ENSMUSG00000102331.1 -21.871815 27.323603 -79.223111 -1.026687e-04 0.0166190950 #> ENSMUSG00000025900.13 16.727251 7.179484 42.367908 8.535604e-05 0.0133260060 #> ENSMUSG00000025902.13 -24.538883 11.174017 -18.548282 -1.231546e-05 -0.0092194951 #> [ reached getOption("max.print") -- omitted 2 rows ] 💡 Interaction terms in linear models There may be cases where we want to assess gene expression differences between 2 conditions within more than one specific group; for example if we were interested in knowing what are the effects of a treatment (\\(X_1=1\\) for treatment and 0 for controls) in females and males separately (\\(X_2=1\\) for females and 0 for males). 
In such cases we can fit an interaction model in which we include the product of \\(X_1\\) and \\(X_2\\) so that \\(X_1X_2=1\\) if a sample comes from a female that was treated and 0 otherwise: \\[E(y|X_1, X_2) =\\beta_{0} + \\beta_1X_1+\\beta_2X_2 + \\beta_3X_1X_2\\] \\(E(y|X_1=1, X_2=1) =\\mu_{12} =\\beta_{0} + \\beta_1+\\beta_2+\\beta_3\\) \\(E(y|X_1=1, X_2=0) =\\mu_{1} =\\beta_{0} + \\beta_1\\) \\(E(y|X_1=0, X_2=1) =\\mu_{2} =\\beta_{0} + \\beta_2\\) \\(E(y|X_1=0, X_2=0) =\\mu_{0} =\\beta_{0}\\) \\(\\beta_1 + \\beta_3=\\) \\(\\mu_{12}-\\mu_2\\) which is the expression difference between treated and control female samples (\\(X_2=1\\)) and \\(\\beta_1 =\\) \\(\\mu_{1}-\\mu_0\\) for male samples (\\(X_2=0\\)). Finally \\(\\beta_3\\), called the interaction term, is (\\(\\mu_{12}-\\mu_2\\))\\(-\\)(\\(\\mu_1-\\mu_0\\)), described as the difference in gene expression changes driven by the treatment in females compared to males [8]. 📝 Exercise 4: fit a linear regression model to the expression data of your genes and extract the coefficients for the included covariates. 5.2.4 eBayes() Next, we want to assess if the differences in gene expression between the sample groups are statistically significant. Initially, we can think of comparing the mean expression of a gene in the sample groups (e.g. cases and controls) which can be handled applying a two-sample t-test assuming that the values in both groups have an approximately normal distribution. Here we use the t-score (t-stats) to define if the difference in the means is statistically significant based on a t-distribution. 
The t-stats is given by: \\[ t=\\frac{\\bar x_1 - \\bar x_2}{\\sqrt{\\frac{s_1^2}{n_1}+\\frac{s_2^2}{n_2}}} \\] where \\(\\bar x_1\\) and \\(\\bar x_2\\) are the means of the expression values of a gene in the first and second sample groups, \\(s_1\\) and \\(s_2\\) are the sample standard deviations of gene expression in the same groups, and \\(n_1\\), \\(n_2\\) the corresponding sample group sizes: \\(s_1 = \\sqrt{\\frac{\\sum_{i=1}^ {n_1} (x_i-\\bar x_1)^2}{n_1-1}}\\) and \\(s_2 = \\sqrt{\\frac{\\sum_{j=1}^ {n_2} (x_j-\\bar x_2)^2}{n_2-1}}\\), with \\(x_i\\) and \\(x_j\\) the gene expression values in the samples of group 1 and 2, respectively. ➡️ Note that we say sample means and sample standard deviations because they are estimators of the population parameters, computed based on the data that we have. We can think of this t-stats as a ratio of signal and noise. The numerator contains the difference between the two means, taken as the signal for DE. The denominator corresponds to the standard error and represents the noise in terms of gene expression variance within the sample groups. This represents how spread out the signal is [9]. In that way, the t-stats is a measure of how strong is the DE signal. Once computed, the t-stats have an associated p-value based on a Student t-distribution under the null hypothesis (\\(H_o\\): \\(\\bar x_1 - \\bar x_2=0\\)). This is exactly what we can get using lm(): ## Lognorm expression of first gene rse_gene_one_gene <- rse_gene_filt[1, ] colData(rse_gene_one_gene) <- cbind(colData(rse_gene_one_gene), "lognorm_expr" = assays(rse_gene_one_gene)$logcounts[1, ] ) ## Fit simple linear model formula <- lognorm_expr ~ Group lm <- lm(formula, data = colData(rse_gene_one_gene)) summary(lm) #> #> Call: #> lm(formula = formula, data = colData(rse_gene_one_gene)) #> #> Residuals: #> Min 1Q Median 3Q Max #> -1.05368 -0.06304 0.03012 0.10254 0.24844 #> #> Coefficients: #> Estimate Std. 
Error t value Pr(>|t|) #> (Intercept) 5.75377 0.04502 127.800 <2e-16 *** #> GroupExperimental -0.04292 0.06694 -0.641 0.525 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 0.2159 on 40 degrees of freedom #> Multiple R-squared: 0.01017, Adjusted R-squared: -0.01457 #> F-statistic: 0.4111 on 1 and 40 DF, p-value: 0.525 ## Two sample t-test t.test(formula, data = colData(rse_gene_one_gene), var.equal = TRUE) #> #> Two Sample t-test #> #> data: lognorm_expr by Group #> t = 0.64121, df = 40, p-value = 0.525 #> alternative hypothesis: true difference in means between group Control and group Experimental is not equal to 0 #> 95 percent confidence interval: #> -0.09236465 0.17820636 #> sample estimates: #> mean in group Control mean in group Experimental #> 5.753765 5.710845 💡 Sample sizes are critical! Larger sample sizes increase the power of the tests and reduce the false discovery rate (FDR) as they decrease the denominator of the t-stats (increasing their values) and slight differences can then be detected. ⚠️ Now consider that for genes with small variances in their expression the t-stats will be greater and we could be detecting non-DEGs as DE (false positives). But two things must be considered at least when working with gene expression data: The first is that expression values are usually not normally distributed. Second, the distributions and variances of expression values vary across genes and conditions. ➡️ With that in mind, inference at the individual gene level can be addressed borrowing information from all the genes in the experiment through a Bayes or empirical Bayes method that produces more powerful tests. The idea of Bayesian statistics is to give unknown quantities a prior distribution, considering each feature as a member of a population of features such as genes. 
More specifically, empirical Bayes methods are procedures for statistical inference in which the (empirical) prior distribution is estimated from the population of all features (from the data) [8]; in standard Bayesian methods this prior distribution is fixed before observing any data [10]. Inspired by the work of Lönnstedt and Speed (2002) in which a simple expression for the posterior odds of differential expression for each gene was computed using a parametric empirical Bayes approach, Smyth, G. K. (2004) generalized this model for its application to experiments with any numbers of samples and conditions and reformulated the posterior odds statistic in terms of a moderated t-statistic in which the posterior residual standard deviations are used instead of the ordinary ones, eliminating the requirement of knowing the non-null prior guess for the proportion of differentially expressed genes required in the log-odds [11]. Let’s see how it proceeds. First, for each gene \\(g\\) we have a vector with the expression values in the \\(n\\) samples: \\[ y_{g}= (y_{g1}, y_{g2}, ..., y_{gn}) \\] We already know that the expected (predicted) gene expression in the samples is \\(E(y_{g})=X\\alpha_g\\) with \\(X\\) the design matrix and \\(\\alpha_g\\) the vector of the coefficients for the \\(h\\) covariates in the model; of these the ones of biological interest are the \\(\\beta_g\\)’s (contrasts of interest). Then, as previously described, a linear model is fitted to the expression data for each gene to obtain the coefficient estimators (\\(\\hat \\alpha_g\\)) (as well as \\(\\hat \\beta_g\\)), the residual sample variances (\\(s_g^2\\)) as estimators of the (true but unknown) residual variances (\\(\\sigma_g^2\\)), and the estimated covariance matrices. Two relevant considerations here: The expression values are not necessarily assumed to be normally distributed. The linear model is not assumed to be necessarily by least squares. 
However, there are two assumptions: The contrast estimators \\(\\hat \\beta_g\\) are assumed to be approximately normally distributed with mean \\(\\beta_g\\). The residual sample variances (\\(s_g^2\\)) are assumed to follow approximately a scaled chi-square distribution. Under such assumptions the ordinary t-stats for the covariate \\(j\\) in the gene \\(g\\) is defined by: \\[ t_{gj}=\\frac{\\hat \\beta_{gj}}{s_g u_{gj}}=\\frac{\\hat \\beta_{gj}}{SE(\\hat \\beta_{gj})} \\] with \\(s_g\\) the residual sample standard deviation of the gene and \\(u_{gj}\\) the unscaled standard deviation. \\(SE\\) stands for standard error. The key step in the empirical Bayes approach of limma is to leverage the information across all genes by defining prior distributions for the unknown coefficients \\(\\beta_{gj}\\) and residual variance \\(\\sigma_g^2\\) of the genes. For \\(\\beta_{gj}\\), the prior distribution of the coefficients that are not zero, i.e. the expected distribution of the logFC of the genes that are DE, is given by: \\[\\beta_{gj}|\\sigma_{g}^2, \\beta_{gj}≠0 \\sim N(0, v_{0j}\\sigma_g^2)\\] “Saying that the betas have prior information centered around zero implies that we are ignorant of the sign (+/-) of the beta.” – Vincent Carey (personal communication) For the residual variances what limma does is to take the residual sample variances of all genes (\\(s_g^2\\)’s) and estimate the empirical parameters of the gamma distribution they are assumed to follow. 
Specifically, \\(\\frac{1}{\\sigma_g^2}\\) is modeled by a scaled chisquare (gamma) prior distribution with mean \\(\\frac{1}{s_0^2}\\) and \\(d_0\\) degrees of freedom, describing how the residual variances are expected to vary across genes: \\[\\frac{1}{\\sigma_g^2} \\sim \\frac{1}{d_0s_0^2}\\chi_{d_0}^2\\] What we want to do next is not only to take the mean of the residual variances in the distribution (prior mean\\(\\frac{1}{s_0^2}\\)) but to estimate each gene residual variance as a Bayes predictor: as a weighted average of the prior mean (\\(\\frac{1}{s_0^2}\\)) and the observed sample variance (\\(s_g^2\\)) of each gene. This is called the moderated variance and what is graphically happening is that we are pulling the observed gene variances towards the prior mean variance: large variances are reduced and the |t-stats| increases (more powerful t-test for those genes) and small variances are increased, decreasing the |t-stats| and the power of the t-test. Under this model the posterior residual sample variance or posterior residual variance estimator (\\(\\tilde s_g^2\\)) is: \\[ \\tilde s_g^2=E(\\sigma_g^2|s_g^2)=\\frac{d_0s_0^2 + d_gs_g^2}{d_0+d_g} \\] Moderation is somehow like having larger sample sizes for the estimation of variance given that the moderated variances are (on average) closer to the population variance than the original sample variances. The moderated t-statistic can be now defined in terms of this posterior residual sample standard deviations instead of the usual ones: \\[ \\tilde t_{gj}=\\frac{\\hat \\beta_{gj}}{\\tilde s_g u_{gj}} \\] These moderated t-stats follow a t-distribution under the null hypothesis (\\(H_o:B_{gj}=0\\)) with degrees of freedom \\(d_g+d_0\\) and the associated p-values can be computed based on such distribution. As previously stated, with this redefined formula, large t-stats merely from very small \\(s_g\\)’s are avoided. 
This results in increased power and reduced false non-discovery rate (FNR) (non-detected DEGs) and the number of DEGs obtained increases [8]. In the end we say we have moderated the residual sample standard deviations of each gene in the t-stats denominator by using the distribution of all of them across the population of genes. ✅ The approach of using the posterior values results in shrinking the gene-wise residual sample variances (\\(s_g^2\\)) towards the prior mean, making a more stable inference when a small number of samples is available. eBayes() will implement this empirical Bayes model to compute for each gene and for each contrast these moderated t-statistics and their unadjusted p-values. Additionally, it returns moderated F-statistics and log-odds of differential expression. The moderated F-statistic tests whether any of the contrasts for a gene is non-zero (\\(H_0:B_{g}=0\\)), i.e., whether that gene is differentially expressed for any contrast; it is similar to the ordinary F-statistic from analysis of variance (ANOVA). The t-test does that for each individual contrast \\(j\\) (\\(H_0:B_{gj}=0\\)). 👉🏼 Check more about F-stats and other statistics computed by eBayes() here: https://support.bioconductor.org/p/6124/. ## eBayes() ## 1. 
Compute the empirical Bayes statistics for DE eBGene <- eBayes(fitGene) ## Outputs of interest: ## s2.prior -> prior residual variance (prior mean 1/s0^2) ## in prior distribution of residual variances eBGene$s2.prior #> [1] 0.78987 ## df.prior -> degrees of freedom d0 in prior distribution ## of residual variances eBGene$df.prior #> [1] 4.913248 ## s2.post -> posterior residual sample variances of the genes (~sg^2) length(eBGene$s2.post) #> [1] 19974 head(eBGene$s2.post) #> [1] 2.3397702 0.7092520 1.1613995 0.9579389 0.7390718 0.4996251 ## t -> moderated t-stats of the genes for each contrast dim(eBGene$t) #> [1] 19974 11 eBGene$t[1:5, 1:5] #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX #> ENSMUSG00000051951.5 -4.4458336 -1.0615386 1.2467597 -4.8896590 -4.2555598 #> ENSMUSG00000102331.1 1.5930925 4.5298885 1.3144392 -0.6934431 -1.3930231 #> ENSMUSG00000025900.13 -1.2896585 0.8392518 1.4887297 -0.1047634 0.3829694 #> ENSMUSG00000025902.13 0.4035855 -0.5885950 -0.5891347 -1.0922124 -1.6131630 #> ENSMUSG00000098104.1 0.7120536 -0.3614893 -1.0031002 0.6473100 1.0165548 ## p.value: corresponding unadjusted p-values of moderated t-stats dim(eBGene$p.value) #> [1] 19974 11 eBGene$p.value[1:5, 1:5] #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX #> ENSMUSG00000051951.5 8.086618e-05 2.955320e-01 0.2205562 2.114989e-05 0.0001425797 #> ENSMUSG00000102331.1 1.199041e-01 6.283899e-05 0.1970317 4.924924e-01 0.1721763827 #> ENSMUSG00000025900.13 2.054130e-01 4.068800e-01 0.1452882 9.171465e-01 0.7039999597 #> ENSMUSG00000025902.13 6.889106e-01 5.598170e-01 0.5594589 2.820128e-01 0.1154631942 #> ENSMUSG00000098104.1 4.810321e-01 7.198519e-01 0.3225237 5.215478e-01 0.3161651149 📝 Exercise 5: obtain the moderated t-stats and associated p-values of all genes in you data for all covariates included in your model. 
5.2.5 topTable() This function is also provided by limma and summarizes the results of the linear model, performs hypothesis tests and adjusts the p-values for multiple testing [12]. Among the summary statistics presented, it returns the log2FCs, moderated t-statistics, p-values, and FDR-adjusted p-values of the genes for a given contrast of interest. The default form of p-value adjustment is the Benjamini and Hochberg’s method to control the false discovery rate (FDR) which assumes independence between genes. Relevant concepts: q-value → is the FDR-adjusted p-value used to control the False Discovery Rate (FDR) that is the expected proportion of false discoveries among the discoveries (DEGs). Selecting discoveries as those being below \\(\\alpha\\) in q-value, we control FDR ≤ \\(\\alpha\\). Now we have the final statistics to determine wich genes are DE. ## topTable() ## 1. Obtain gene-wise DE stats for Group (Nicotine vs Ctrl) top_genes <- topTable(eBGene, coef = coef, p.value = 1, number = nrow(rse_gene_filt), sort.by = "none") ## Outputs for each gene and for the coeff selected (Group): ## logFC: log2-fold-changes head(top_genes$logFC) #> [1] -0.05125195 0.72450620 0.17256694 -0.05025788 -0.02726320 -0.02684710 In limma the \\(\\beta_{gj}\\)’s are the logFC’s: setdiff(top_genes$logFC, eBGene$coefficients[, "GroupExperimental"]) #> numeric(0) ## t: moderated t-stats head(top_genes$t) #> [1] -1.0615386 4.5298885 0.8392518 -0.5885950 -0.3614893 -1.0959528 ## . P.value: unadjusted p-values of t-stats head(top_genes$P.Value) #> [1] 2.955320e-01 6.283899e-05 4.068800e-01 5.598170e-01 7.198519e-01 2.803946e-01 ## adj.P.Val: p-values adjusted to control the FDR head(top_genes$adj.P.Val) #> [1] 0.53854173 0.00412576 0.63704026 0.75340755 0.85943342 0.52324928 After running all these 5 steps, one main initial plot we have to look at is the histogram of the p-values of the moderated t-stats of the genes. 
If there were DEGs, we’d expect to see a flat distribution of p-values corresponding to non-DEGs and a peak near p=0 for DEGs (for which we reject the null hypothesis). If this peak is absent but a uniform distribution still appears, DEGs might be detected after correcting for multiple testing. ## Histogram of unadjusted p-values hist(top_genes$P.Value, xlab = "p-values", main = "") If very different p-value distributions are obtained from the uniform one, the best we can do is trying to explore if there are specific groups of genes (e.g. lowly-expressed genes) presenting such variable p-values and revisiting the assumptions and considerations of the statistical tests implemented [13]. 📝 Exercise 6: obtain the DE logFCs, t-stats, p-values, and adjusted p-values of the genes for a given constrast/covariate under study. 5.3 DE visualization DEGs are identified defining a significance threshold (on the adjusted p-values). Let’s quantify the number of DEGs for nicotine exposure in pup brain and visualize their expression and DE statistics. ## DEGs for FDR<0.05 de_genes <- top_genes[which(top_genes$adj.P.Val < 0.05), ] ## Number of DEGs dim(de_genes) #> [1] 1895 6 5.3.1 Volcano plots A very practical and useful plot to graphically represent DEGs and visualize their expression differences between conditions is a volcano plot. This is a scatter plot of the logFC’s of the genes in the x-axis vs their adjusted p-values in a -log scale in the y-axis. library("ggplot2") ## Define up- and down-regulated DEGs, and non-DEGs FDR <- 0.05 DE <- vector() for (i in 1:dim(top_genes)[1]) { if (top_genes$adj.P.Val[i] > FDR) { DE <- append(DE, "n.s.") } else { if (top_genes$logFC[i] > 0) { DE <- append(DE, "Up") } else { DE <- append(DE, "Down") } } } top_genes$DE <- DE ## Colors, sizes and transparencies for up & down DEGs and non-DEGs cols <- c("Up" = "indianred2", "Down" = "steelblue2", "n.s." = "grey") sizes <- c("Up" = 1.3, "Down" = 1.3, "n.s." 
= 0.8) alphas <- c("Up" = 0.4, "Down" = 0.6, "n.s." = 0.5) ## Plot volcano plot ggplot( data = top_genes, aes( x = logFC, y = -log10(adj.P.Val), color = DE, fill = DE, size = DE, alpha = DE ) ) + geom_point(shape = 21) + geom_hline( yintercept = -log10(FDR), linetype = "dashed", color = "gray35", linewidth = 0.5 ) + geom_vline( xintercept = c(-1, 1), linetype = "dashed", color = "gray35", linewidth = 0.5 ) + labs(y = "-log10(FDR)", x = "logFC(Nicotine vs Control)") + theme_bw() + scale_color_manual(values = cols, name = "Differential expression") + scale_fill_manual(values = cols, name = "Differential expression") + scale_size_manual(values = sizes, name = "Differential expression") + scale_alpha_manual(values = alphas, name = "Differential expression") + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), legend.key.height = unit(0.15, "cm"), axis.title = element_text(size = (13)), legend.title = element_text(size = 13), legend.text = element_text(size = 12) ) 5.3.2 Heat maps Another common way to represent differential expression results is through a heat map. The package ComplexHeatmap offers a flexible toolkit to easily create heat maps with row and column annotations, a feature of particular value to plot expression data of genes across samples with multiple biological and technical differences. Although initially all genes in your data can be plotted, frequently only DEGs are included as they tend to show clearer gene expression patterns. library("ComplexHeatmap") ## We plot lognorm counts lognorm_data <- assays(rse_gene_filt)$logcounts ## Subset to DEGs only lognorm_data <- lognorm_data[rownames(de_genes), ] ## Define column (sample) names to display colnames(lognorm_data) <- paste0("Pup_", 1:dim(lognorm_data)[2]) 🗒️ Notes: It is sometimes convenient to regress out the technical variables’ contributions on gene expression to see more clearly the effects of interest. 
This can happen, for instance, when the logFCs are too small to see any significant differences in the plots or when there are other strong confounding factors. Functions such as cleaningY() of jaffelab can be used for this purpose. The lognorm counts have to be correctly scaled and centered (around zero) to make the differences in the expression of the genes more notorious in the heat map. A simple way to do that is substracting from each lognorm count \\(y_{gi}\\) (from the gene \\(g\\) and sample \\(i\\)) the mean expression of the gene* and dividing by the standard deviation (\\(\\sigma\\)) of the same gene expression values. This is formally called the z-score: the number of standard deviations away from the mean. \\[ z=\\frac{y_{gi} - \\frac{\\sum_{k=1}^{n}{y_{gk}}}{n}}{\\sigma}, \\] \\(n\\) is the number of samples. * This can also be done by columns (samples), not only by rows (genes). 👉🏼 For more on centering and scaling, see this video: ## Center and scale the data to make differences more evident lognorm_data <- (lognorm_data - rowMeans(lognorm_data)) / rowSds(lognorm_data) ## Sample annotation: Sex, Group, and library size col_anno <- HeatmapAnnotation( df = as.data.frame(colData(rse_gene_filt)[, c("Sex", "Group")]), library_size = anno_barplot(colData(rse_gene_filt)$sum, gp = gpar(fill = "lightyellow2")), col = list( "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "Group" = c("Control" = "gray68", "Experimental" = "gold2") ) ) ## Gene annotation: logFC and biotype de_genes$logFC_binary <- sapply(de_genes$logFC, function(x) { if (x > 0) { ">0" } else { "<0" } }) de_genes$protein_coding_gene <- sapply(rowData(rse_gene_filt[rownames(de_genes), ])$gene_type, function(x) { if (x == "protein_coding") { "TRUE" } else { "FALSE" } }) gene_anno <- rowAnnotation( df = as.data.frame(cbind( "logFC" = de_genes$logFC_binary, "protein_coding_gene" = de_genes$protein_coding_gene )), col = list( "logFC" = c("<0" = "deepskyblue3", ">0" = "brown2"), "protein_coding_gene" 
= c("TRUE" = "darkseagreen3", "FALSE" = "magenta") ) ) library("circlize") ## Plot Heatmap(lognorm_data, name = "lognorm counts", show_row_names = FALSE, top_annotation = col_anno, left_annotation = gene_anno, row_km = 2, column_km = 2, col = colorRamp2(c(-4, -0.0001, 0.0001, 4), c("darkblue", "lightblue", "lightsalmon", "darkred")), row_title = "DEGs", column_title = "Samples", column_names_gp = gpar(fontsize = 7), heatmap_width = unit(12.5, "cm"), heatmap_height = unit(12.5, "cm") ) 📝 Exercise 7: obtain the number of DEGs you got and represent them in a volcano plot and a heat map. Include all the sample and gene information you consider relevant in the latter. References Li, W. V., & Li, J. J. (2018). Modeling and analysis of RNA‐seq data: a review from a statistical perspective. Quantitative Biology, 6(3), 195-209. Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome biology, 15(2), 1-17. Smyth, G. K., Ritchie, M., Thorne, N., Wettenhall, J., Shi, W., & Hu, Y. (2002). limma: linear models for microarray and RNA-Seq data user’s guide. Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia. van den Berg, S. M. (2022). Analysing data using linear models. Web site: https://bookdown.org/pingapang9/linear_models_bookdown/ Wikipedia. (n.d.). Ordinary least squares. Web site: https://en.wikipedia.org/wiki/Ordinary_least_squares Taboga, Marco (2021). “Generalized least squares”, Lectures on probability theory and mathematical statistics. Kindle Direct Publishing. Online appendix. https://www.statlect.com/fundamentals-of-statistics/generalized-least-squares. Documentation for lmFit: https://rdrr.io/bioc/limma/man/lmFit.html The Pennsylvania State University. (2018). Statistical Analysis of Genomics Data. Web site: https://online.stat.psu.edu/stat555/node/36/ Tushe, M. (2021). A Simple Trick to Understand the t-test. 
Web site: https://miroslavtushev.medium.com/a-simple-trick-to-understand-the-t-test-2c2a9e7f1dc5 Wikipedia. (n.d.). Empirical Bayes method. Web site: https://en.wikipedia.org/wiki/Empirical_Bayes_method#:~:text=Empirical Bayes methods are procedures,before any data are observed. Smyth, G. K. (2004). Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Statistical applications in genetics and molecular biology, 3(1). Documentation for topTable: https://www.rdocumentation.org/packages/limma/versions/3.28.14/topics/toptable Robinson, D. (2014). How to interpret a p-value histogram. Web site: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/ "],["interpreting-model-coefficients-with-exploremodelmatrix.html", "6 Interpreting model coefficients with ExploreModelMatrix 6.1 Model objects in R 6.2 ExploreModelMatrix 6.3 Example 1 6.4 Example 2 6.5 Example 3 6.6 Exercise 6.7 To learn more 6.8 Community", " 6 Interpreting model coefficients with ExploreModelMatrix Instructor: Leo 6.1 Model objects in R Linear regression review https://lcolladotor.github.io/bioc_team_ds/helping-others.html#linear-regression-example With R, we use the model.matrix() to build regression models using the Y ~ X1 + X2 formula syntax as exemplified below. 
## ?model.matrix mat <- with(trees, model.matrix(log(Volume) ~ log(Height) + log(Girth))) mat #> (Intercept) log(Height) log(Girth) #> 1 1 4.248495 2.116256 #> 2 1 4.174387 2.151762 #> 3 1 4.143135 2.174752 #> 4 1 4.276666 2.351375 #> 5 1 4.394449 2.370244 #> 6 1 4.418841 2.379546 #> 7 1 4.189655 2.397895 #> 8 1 4.317488 2.397895 #> 9 1 4.382027 2.406945 #> 10 1 4.317488 2.415914 #> 11 1 4.369448 2.424803 #> 12 1 4.330733 2.433613 #> 13 1 4.330733 2.433613 #> 14 1 4.234107 2.459589 #> 15 1 4.317488 2.484907 #> 16 1 4.304065 2.557227 #> [ reached getOption("max.print") -- omitted 15 rows ] #> attr(,"assign") #> [1] 0 1 2 colnames(mat) #> [1] "(Intercept)" "log(Height)" "log(Girth)" How do we interpret the columns of our model matrix mat? summary(lm(log(Volume) ~ log(Height) + log(Girth), data = trees)) #> #> Call: #> lm(formula = log(Volume) ~ log(Height) + log(Girth), data = trees) #> #> Residuals: #> Min 1Q Median 3Q Max #> -0.168561 -0.048488 0.002431 0.063637 0.129223 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) -6.63162 0.79979 -8.292 5.06e-09 *** #> log(Height) 1.11712 0.20444 5.464 7.81e-06 *** #> log(Girth) 1.98265 0.07501 26.432 < 2e-16 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 0.08139 on 28 degrees of freedom #> Multiple R-squared: 0.9777, Adjusted R-squared: 0.9761 #> F-statistic: 613.2 on 2 and 28 DF, p-value: < 2.2e-16 6.2 ExploreModelMatrix It’s a Bioconductor package which is useful to understand statistical models we use in differential expression analyses. It is interactive and helps us by creating some visual aids. http://www.bioconductor.org/packages/ExploreModelMatrix For more details, check their paper https://doi.org/10.12688/f1000research.24187.2. 
We’ll go over the examples they provide at http://www.bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html 6.3 Example 1 ## Load ExploreModelMatrix library("ExploreModelMatrix") ## Example data (sampleData <- data.frame( genotype = rep(c("A", "B"), each = 4), treatment = rep(c("ctrl", "trt"), 4) )) #> genotype treatment #> 1 A ctrl #> 2 A trt #> 3 A ctrl #> 4 A trt #> 5 B ctrl #> 6 B trt #> 7 B ctrl #> 8 B trt ## Let's make the visual aids provided by ExploreModelMatrix vd <- ExploreModelMatrix::VisualizeDesign( sampleData = sampleData, designFormula = ~ genotype + treatment, textSizeFitted = 4 ) ## Now let's plot these images cowplot::plot_grid(plotlist = vd$plotlist) Interactively, we can run the following code: ## We are using shiny again here app <- ExploreModelMatrix( sampleData = sampleData, designFormula = ~ genotype + treatment ) if (interactive()) shiny::runApp(app) 6.4 Example 2 http://bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html#example-2 6.5 Example 3 http://bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html#example-3 6.6 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: Interpret ResponseResistant.Treatmentpre from the second example. It could be useful to take a screenshot and to draw some annotations on it. Exercise 2: Why is the 0 important at the beginning of the formula in the third example? 
6.7 To learn more A guide to creating design matrices for gene expression experiments: http://bioconductor.org/packages/release/workflows/vignettes/RNAseq123/inst/doc/designmatrices.html https://f1000research.com/articles/9-1444 “Model matrix not full rank” http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#model-matrix-not-full-rank 6.8 Community Some of the ExploreModelMatrix authors: https://bsky.app/profile/csoneson.bsky.social https://twitter.com/FedeBioinfo https://twitter.com/mikelove Some of the edgeR and limma authors: https://twitter.com/mritchieau https://twitter.com/davisjmcc https://twitter.com/markrobinsonca https://twitter.com/AliciaOshlack "],["dge-model-building-with-variancepartition.html", "7 DGE model building with variancePartition 7.1 Canonical Correlation Analysis 7.2 Fit model and extract fraction of variance explained 7.3 Examine the expression of most affected genes by each sample variable References", " 7 DGE model building with variancePartition Instructor: Daianna González-Padilla After having processed RNA-seq data and assessed the quality and the variability of the samples the next step for DGE is to explore the variance in the expression of the genes themselves according to sample groups, or in other words, to quantify the contribution of the multiple sample variables in the gene expression variation. To determine which variables are the major drivers of expression variability, and importantly to define if the technical variability of RNA-seq data is low enough to study the condition of interest, we can implement an analysis of variance partition. variancePartition is a package that decomposes for each gene the expression variation into fractions of variance explained (FVE) by the sample variables in the experimental design of high-throughput genomics studies [1]. 
In order to exemplify how to implement this analysis and the type of conclusions that can be drawn from it, we’ll use bulk RNA-seq data from the smokingMouse package. ## Load the container package for this type of data library("SummarizedExperiment") ## Connect to ExperimentHub library("ExperimentHub") eh <- ExperimentHub::ExperimentHub() ## Load the datasets of the package myfiles <- query(eh, "smokingMouse") ## Download the mouse gene data rse_gene <- myfiles[["EH8313"]] ## Keep samples from nicotine experiment and pups only rse_gene_nic <- rse_gene[ , which(rse_gene$Expt == "Nicotine" & rse_gene$Age == "Pup") ] ## Use expressed genes only (i.e. that passed the filtering step) rse_gene_filt <- rse_gene_nic[ rowData(rse_gene_nic)$retained_after_feature_filtering, ] ## Keep samples that passed QC and manual sample filtering steps (all passed) rse_gene_filt <- rse_gene_filt[ , rse_gene_filt$retained_after_QC_sample_filtering & rse_gene_filt$retained_after_manual_sample_filtering ] 7.1 Canonical Correlation Analysis Prior to the variance partition analysis, evaluating the correlation between sample variables is crucial because highly correlated variables can produce unstable estimates of the variance fractions and impede the identification of the variables that really contribute to the expression variation. There are at least two problems with correlated variables: If two variables are highly correlated we could incorrectly determine that one of them contributes to gene expression changes when it was actually not explanatory but just correlated with a real contributory variable. The part of variance explained by a biologically relevant variable can be reduced by the apparent contributions of correlated variables, if for example, they contain very similar information (i.e. are redundant variables). Additionally, the analysis is better performed with simpler models, specially when we have a limited number of samples in the study. 
Hence, to remove such variables we must first identify them. We will perform a Canonical Correlation Analysis (CCA) with canCorPairs() that assesses the degree to which the variables co-vary and contain the same information. With CCA, linear combinations that maximize the correlation between variable sets are estimated. CCA is just like a normal correlation analysis between 2 vectors but it can accommodate matrices as well (variable sets). Note that CCA returns correlations values between 0 and 1 [2]. library("variancePartition") library("pheatmap") ## Plot heatmap of correlations ## Define all variables to examine; remove those with single values formula <- ~ Group + Sex + plate + flowcell + mitoRate + overallMapRate + totalAssignedGene + rRNA_rate + sum + detected + ERCCsumLogErr ## Measure correlations CCA <- canCorPairs(formula, colData(rse_gene_filt)) ## Heatmap pheatmap( CCA, ## data color = hcl.colors(50, "YlOrRd", rev = TRUE), ## color scale fontsize = 8, ## text size border_color = "black", ## border color for heatmap cells cellwidth = unit(0.4, "cm"), ## height of cells cellheight = unit(0.4, "cm") ## width of cells ) p.alert { background-color: #FFE4E1; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } ⚠️ Very important: always inspect if there are any correlated variables with the one of interest in your study! This is extremely important as correlated variables could represent confounding factors and/or hinder the detection of significant DE events, thus yielding to misleading results. Importantly, Group is not highly correlated with any other variable in this study, but overallMapRate is correlated with rRNA_rate, library preparation plate, and the sequencing flowcell; sum (library size) and detected (number of expressed genes) are also correlated. For a detailed definition of these variables check here. 
📝 Exercise 1: Run a CCA analysis and determine which pairs of variables in your dataset are correlated. Is there any correlated variable with your variable of interest? Depending on your results there’s sometimes convenient to dig a little deeper into the relationship between correlated variables and to analyze these metrics among our control and experimental samples. Let’s work on that! library("ggplot2") library("cowplot") ## Boxplots/Scatterplots/Barplots for each pair of correlated variables corr_plots <- function(sample_var1, sample_var2, sample_color) { ## Define sample colors by variable colors <- list( "Group" = c("Control" = "brown2", "Experimental" = "deepskyblue3"), "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "plate" = c("Plate1" = "darkorange", "Plate2" = "lightskyblue", "Plate3" = "deeppink1"), "flowcell" = c( "HKCG7DSXX" = "chartreuse2", "HKCMHDSXX" = "magenta", "HKCNKDSXX" = "turquoise3", "HKCTMDSXX" = "tomato" ) ) data <- colData(rse_gene_filt) ## a) Barplots for categorical variable vs categorical variable if (class(data[, sample_var1]) == "character" & class(data[, sample_var2]) == "character") { ## y-axis label y_label <- paste("Number of samples from each ", sample_var2, sep = "") ## Stacked barplot with counts for 2nd variable plot <- ggplot(data = as.data.frame(data), aes( x = !!rlang::sym(sample_var1), fill = !!rlang::sym(sample_var2) )) + geom_bar(position = "stack") + ## Colors by 2nd variable scale_fill_manual(values = colors[[sample_var2]]) + ## Show sample counts on stacked bars geom_text(aes(label = after_stat(count)), stat = "count", position = position_stack(vjust = 0.5), colour = "gray20", size = 3 ) + theme_bw() + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = y_label ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) 
) } ## b) Boxplots for categorical variable vs continuous variable else if (class(data[, sample_var1]) == "character" & class(data[, sample_var2]) == "numeric") { plot <- ggplot(data = as.data.frame(data), mapping = aes( x = !!rlang::sym(sample_var1), y = !!rlang::sym(sample_var2), color = !!rlang::sym(sample_var1) )) + geom_boxplot(size = 0.25, width = 0.32, color = "black", outlier.color = NA) + geom_jitter(width = 0.15, alpha = 1, size = 1.5) + stat_smooth(method = "lm", geom = "line", alpha = 0.6, size = 0.4, span = 0.3, aes(group = 1), color = "orangered3") + scale_color_manual(values = colors[[sample_var1]]) + theme_bw() + guides(color = "none") + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = gsub("_", " ", sample_var2), x = sample_var1 ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } ## c) Scatterplots for continuous variable vs continuous variable else if (class(data[, sample_var1]) == "numeric" & class(data[, sample_var2]) == "numeric") { plot <- ggplot(as.data.frame(data), aes( x = !!rlang::sym(sample_var1), y = !!rlang::sym(sample_var2), color = !!rlang::sym(sample_color) )) + geom_point(size = 2) + stat_smooth(method = "lm", geom = "line", alpha = 0.6, size = 0.6, span = 0.25, color = "orangered3") + ## Color by sample_color variable scale_color_manual(name = sample_color, values = colors[[sample_color]]) + theme_bw() + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = gsub("_", " ", sample_var2), x = gsub("_", " ", sample_var1) ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } return(plot) } As shown below, Group and plate 
are moderately correlated given that 14 of the 23 (60.8%) control samples and 11 of the 19 (57.9%) exposed samples were in the first and second plate for library preparation, respectively. ## Correlation plot for Group and plate p <- corr_plots("Group", "plate", NULL) p + theme(plot.margin = unit(c(1, 5.5, 1, 5.5), "cm")) We can also observe that even though QC metrics such as overallMapRate and rRNA_rate are correlated, there’s no distinction between control and exposed samples for these variables. ## Correlation plot for overallMapRate and rRNA_rate p <- corr_plots("overallMapRate", "rRNA_rate", "Group") p + theme(plot.margin = unit(c(2, 3.5, 2, 3.5), "cm")) Moreover, the correlation between overallMapRate and the library preparation plate is mainly given by the plate 1 samples that have lower rates, similar to what occurs with the samples from the first flowcell. ## Correlation plot for overallMapRate and plate p <- corr_plots("plate", "overallMapRate", NULL) p + theme(plot.margin = unit(c(2, 5, 2, 5), "cm")) ## Correlation plot for overallMapRate and flowcell p <- corr_plots("flowcell", "overallMapRate", NULL) p + theme(plot.margin = unit(c(2, 5, 2, 5), "cm")) Interestingly, control samples seem to present more expressed genes than exposed samples for a given library size, however none of these variables is correlated with Group. ## Correlation plots for sum and detected p <- corr_plots("sum", "detected", "Group") p + theme(plot.margin = unit(c(2, 3.5, 2, 3.5), "cm")) ❓ Now look at the following plot. Why is it important that experimental and control samples are distributed throughout all sequencing flowcells? p <- corr_plots("Group", "flowcell", NULL) plots <- plot_grid(p) plots + theme(plot.margin = unit(c(0.5, 5, 0.5, 5), "cm")) Hint: What would happen if all experimental samples were in one flowcell and all controls in another? 
After identifying which variables are correlated and exploring the metrics of control and experimental samples the next is to determine which variable from each pair of correlated variables should be discarded and which one included in the models. How do we discern which ones to keep? As recommended in the variancePartition user’s guide [2], initially we can fit a linear model to the expression data of each gene taking all sample variables and then investigate which ones explain higher percentages of variance for many genes. But first let’s review how variancePartition works. 7.2 Fit model and extract fraction of variance explained Briefly, what variancePartition does is to fit a linear model for each gene separately and to compute the fraction of the total data variance explained by each variable of the study design, as well as by the residuals, using the calcVarPart() function. These computed fractions of variation explained (FVE) summarize the contribution of each variable and naturally sum to 1 [1]. variancePartition fits two types of models: Linear mixed model (LMM) where all categorical variables are modeled as random effects and all continuous variables are fixed effects. The function lmer() from lme4 is used to fit this model. ## Fit LMM specifying the existence of random effects with '(1| )' fit <- lmer(expr ~ a + b + (1|c), data=data) Fixed effects model, which is basically the standard linear model (LM), where all variables are modeled as fixed effects. The function lm() is used to fit this model. ## Fit LM modeling all variables as fixed effects fit <- lm(expr ~ a + b + c, data=data) In our case, the function will be modeled as a mixed model since we have both effects. p.question{ background-color: #E3E3E3; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } ❓ What are random and fixed effects? 
Categorical variables are usually modeled as random effects, i.e., variables such as flowcell, plate, donor, etc. whose levels are “randomly chosen or selected from a larger population”. These levels are not of interest by themselves but the grouping of the samples by them. Random effects correspond to those variables whose effect on the expression of a gene varies according to its sample groups/levels. On the other hand, continuous variables can be modeled as fixed effects. These are sample-level variables that preserve their impact on the expression of a gene irrespective of the sample. ❓ Why is this effect distinction important? Because when we have clustered data, like gene expression values grouped by sample sex, batch, etc. we are violating the relevant assumption of independence, making an incorrect inference when using a general linear model (GLM). If we have clustered data where the variables’ values have distinct effects on gene expression, we must work with an extension of GLM, i.e. with the linear mixed model (LMM) that contains a mix of both fixed and random effects [3]. Linear mixed model fit 1️⃣ After fitting a linear model to the expression data of each gene we obtain the predicted expression of the genes given by the estimated coefficients of the variables: \\(\\hat y =\\sum_{1}^j\\beta_{j}X_{j} + \\sum_{1}^k\\alpha_{k}Z_{k}\\) ← the expression of a gene across all samples is given by the samples’ values in the \\(j\\) fixed effects and \\(k\\) random effects. Therefore, the gene expression in the sample \\(i\\) is given by \\(\\hat y_i =\\sum_{1}^j\\beta_{j}X_{ij} + \\sum_{1}^k\\alpha_{k}Z_{ik}\\). Then \\(y=\\hat y+\\epsilon\\), which means that the true (observed) expression value is given by the predicted value plus an error term (\\(\\epsilon\\)), also called noise or residual: \\[y =\\sum_{1}^j\\beta_{j}X_{j} + \\sum_{1}^k\\alpha_{k}Z_{k} + \\epsilon\\] \\(X_j\\) is the vector of the values of the samples in the \\(j\\)th fixed effect. 
\\(\\beta_j\\) is the predicted coefficient of the fixed effect \\(j\\). \\[ X_j\\beta_j= \\ \\ _{n \\ \\ samples}\\stackrel{j^{th}\\ \\ fixed\\ \\ effect }{\\begin{bmatrix} X_{1j} \\\\ ... \\\\ X_{(n-1)j} \\\\ X_{nj} \\end{bmatrix}}\\beta_j = {\\begin{bmatrix} X_{1j}\\beta_j \\\\ ... \\\\ X_{(n-1)j}\\beta_j \\\\ X_{nj} \\beta_j \\end{bmatrix}} \\] \\(Z_k\\) is the vector of values the samples have for the \\(k\\)th random effect. \\(\\alpha_k\\) is the predicted coefficient of the random effect \\(k\\). These are drawn from a normal distribution \\(∼N(0, \\sigma_{\\alpha_k}^2 )\\). \\[ Z_k\\alpha_k= \\ \\ _{n \\ \\ samples}\\stackrel{{k^{th}\\ \\ random\\ \\ effect }}{\\begin{bmatrix} Z_{1k} \\\\ ... \\\\ Z_{(n-1)k}\\\\ Z_{nk} \\end{bmatrix}}\\alpha_k = {\\begin{bmatrix} Z_{1k}\\alpha_k \\\\ ... \\\\ Z_{(n-1)k}\\alpha_k\\\\ Z_{nk}\\alpha_k \\end{bmatrix}} \\] \\(\\epsilon\\) is the noise term which is \\(y-\\hat y\\), the difference between the observed and predicted expression and is also drawn from \\(∼N(0, \\sigma_{\\epsilon}^2 )\\). Expanding, \\[ y= {\\begin{bmatrix} X_{11}\\beta_1 \\\\ ... \\\\ X_{(n-1)1}\\beta_1 \\\\ X_{n1} \\beta_1 \\end{bmatrix}} + {\\begin{bmatrix} X_{12}\\beta_2 \\\\ ... \\\\ X_{(n-1)2}\\beta_2 \\\\ X_{n2} \\beta_2 \\end{bmatrix}} + ... +{\\begin{bmatrix} X_{1j}\\beta_j \\\\ ... \\\\ X_{(n-1)j}\\beta_j \\\\ X_{nj} \\beta_j \\end{bmatrix}} + {\\begin{bmatrix} Z_{11}\\alpha_1 \\\\ ... \\\\ Z_{(n-1)1}\\alpha_1\\\\ Z_{n1}\\alpha_1 \\end{bmatrix}} +{\\begin{bmatrix} Z_{12}\\alpha_2 \\\\ ... \\\\ Z_{(n-1)2}\\alpha_2\\\\ Z_{n2}\\alpha_2 \\end{bmatrix}} \\] \\[ + ... + {\\begin{bmatrix} Z_{1k}\\alpha_k \\\\ ... \\\\ Z_{(n-1)k}\\alpha_k\\\\ Z_{nk}\\alpha_k \\end{bmatrix}} + {\\begin{bmatrix} \\epsilon_1 \\\\ ... 
\\\\ \\epsilon_{(n-1)} \\\\ \\epsilon_n \\end{bmatrix}} \\] All parameters are estimated with maximum likelihood, the default method in the variancePartition software when random effects are specified because it performs best in simulations. 2️⃣ Then, calcVarPart() computes for each fixed effect \\(\\sum_{i=1}^n(\\beta_{j}X_{ij}-\\bar{\\beta_{j}X_{j}})^2=var(\\beta_{j}X_{j})(n-1)\\), the squared sum of the predicted expression values of a gene in the \\(n\\) samples only taking into account the variable \\(j\\) in the regression model: \\(\\hat y = \\beta_{j}X_{j}\\). Each of these squared sums is scaled by additional factors but to simplify let’s just explain this analysis in terms of the variance (that is proportional to the squared sum): The variance explained by the \\(j\\)th fixed effect is: \\(\\sigma_{\\beta_j}^2=var(X_j{\\beta_j})\\) For random effects the variances are computed by variance component estimates with VarCorr() from nmle: The variance of the \\(k\\)th random effect is \\(\\sigma_{\\alpha_k}^2=var(Z_k{\\alpha_k})\\) The total variance of the expression values is calculated by \\(\\sum_{i=1}^n(y_i - \\bar { y})^2=var(y)(n-1)\\), where \\(y_i = \\sum_{1}^j\\beta_{j}X_{ij} + \\sum_{1}^k\\alpha_{k}Z_{ik} + \\epsilon_i\\) considering all variables in the model and the error: The total variance is: \\(var(y)= \\sigma_{Total}^2= var(X_{1}\\beta_1)+var(X_{2}\\beta_2)+...+var(X_{j}\\beta_j)+var(Z_{1}\\alpha_1)+var(Z_{2}\\alpha_2)+...+var(Z_{k}\\alpha_k)+var(\\epsilon)=\\) \\(\\sum_1^jvar(X_j\\beta_j)+\\sum_1^kvar(Z_k\\alpha_k)+var(\\epsilon)=\\) \\(\\sigma_{Total}^2=\\sum_1^j{ \\sigma_{\\beta_j}^2} + \\sum_1^k{ \\sigma_{\\alpha_k}^2} + \\sigma_{\\epsilon}^2\\) 3️⃣ Finally, it computes: The fraction of the total data variance explained by the \\(j\\)th fixed effect is \\(\\sigma_{\\beta_j}^2\\) / \\(\\sigma_{Total}^2\\) The fraction of the total data variance explained by the \\(k\\)th random effect is \\(\\sigma_{\\alpha_k}^2\\) / 
\\(\\sigma_{Total}^2\\) Note that \\(y=\\hat y+\\epsilon\\) because the expression can’t be completely described by a straight line, so not all the variation of \\(y\\) can be explained by the variation of the sample variables, instead \\(var(y)=var(\\hat y)+var(\\epsilon)=var(\\hat y) + \\sigma_{\\epsilon}^2\\), where \\(\\sigma_{\\epsilon}^2=\\sum_{i=1}^n(\\hat y_i -y_i)^2/(n-1)\\). The residual variance is \\(\\sigma_{\\epsilon}^2\\) / \\(\\sigma_{Total}^2\\) ; this is the variance that the model (with the included covariates) couldn’t explain. p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } 👉🏼 Source code of calcVarPart() here. Once we have reviewed what variancePartition computes and how, we can use it to quantify the FVE for each variable. ## Fit a linear mixed model (LMM) that takes continuous variables as fixed effects and categorical variables as random effects varPartAnalysis <- function(formula) { ## Ignore genes with variance 0 genes_var_zero <- which(apply(assays(rse_gene_filt)$logcounts, 1, var) == 0) if (length(genes_var_zero) > 0) { rse_gene_filt <- rse_gene_filt[-genes_var_zero, ] } ## Loop over each gene to fit the model and extract variance explained by each variable varPart <- fitExtractVarPartModel(assays(rse_gene_filt)$logcounts, formula, colData(rse_gene_filt)) # Sort variables by median fraction of variance explained (FVE) vp <- sortCols(varPart) p <- plotVarPart(vp) return(list(p, vp)) } In the following violin plots, we have the % of variance explained in the expression of each gene by each covariate, based on the model with all variables. Of our pairs of correlated variables, rRNA_rate has the highest median FVE and thus, should be included in the models for DGE, whereas variables correlated with it (overallMapRate) must be removed. Furthermore, library preparation plate must be excluded as it is correlated with Group. 
##### Fit model with all variables ##### # sum, detected, and ERCCsumLogErr are not included as they are in very different scales! formula <- ~ (1 | Group) + (1 | Sex) + (1 | plate) + (1 | flowcell) + mitoRate + overallMapRate + totalAssignedGene + rRNA_rate plot <- varPartAnalysis(formula)[[1]] plot + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), axis.text.x = element_text(size = (7)), axis.text.y = element_text(size = (7.5)) ) ⚠️ Note that some variables such as the library size and the number of detected genes that are in different orders of magnitude to the rest of the QC metrics and categorical variables are not included in this analysis as they can impact the model predictions and the interpretability of the regression results [4]. These variables can be analyzed only after rescaling. After re-running the analysis without the previous correlated variables, now Group contribution increases but so does the residual source, i.e., the % of gene expression variance that the model couldn’t explain increases, although the increase is rather low. This occurs because when we remove independent variables from a regression equation, we can explain less of the variance of the dependent variable [3]. That’s the price to pay when dropping variables, but it is convenient when we don’t have many samples for the model to determine variable unique contributions. ##### Fit model without correlated variables ##### ## Pup plots without overallMapRate and plate formula <- ~ (1 | Group) + (1 | Sex) + (1 | flowcell) + mitoRate + overallMapRate + totalAssignedGene varPart <- varPartAnalysis(formula) varPart_data <- varPart[[2]] plot <- varPart[[1]] plot + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), axis.text.x = element_text(size = (7)), axis.text.y = element_text(size = (7.5)) ) 📝 Exercise 2: Perform a variance partition analysis and determine which of your correlated variables have higher contributions in gene expression variance. 
Based on that, select a set of variables to model gene expression for DGE. But what does it mean that a variable explains a high percentage of the expression variation of a gene? In the following section we will visualize the existing relationships between the gene expression values in the samples and the sample-level variables. 7.3 Examine the expression of most affected genes by each sample variable In the plots presented below we can appreciate the expression levels across samples of the most affected genes by each variable, i.e., the genes for which the variable explains the highest percentages of variance, plotted against the sample values for the same variable. Observe the strong correlations that exist for the sample variables and the gene expression of such affected genes, which ends up causing these variables to explain high percentages of gene expression variation and which obligate us to adjust for them in the models. library("rlang") ## Plot of gene expression lognorm counts vs. 
sample variable plot_gene_expr <- function(sample_var, gene_id) { colors <- list( "Group" = c("Control" = "brown2", "Experimental" = "deepskyblue3"), "Age" = c("Adult" = "slateblue3", "Pup" = "yellow3"), "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "Pregnancy" = c("Yes" = "darkorchid3", "No" = "darkolivegreen4"), "plate" = c("Plate1" = "darkorange", "Plate2" = "lightskyblue", "Plate3" = "deeppink1"), "flowcell" = c( "HKCG7DSXX" = "chartreuse2", "HKCMHDSXX" = "magenta", "HKCNKDSXX" = "turquoise3", "HKCTMDSXX" = "tomato" ) ) ## Lognorm counts of the gene across samples data <- colData(rse_gene_filt) data$gene_expr <- assays(rse_gene_filt)$logcounts[gene_id, ] ## Percentage of variance explained by the variable percentage <- 100 * signif(varPart_data[gene_id, sample_var], digits = 3) ## Boxplots for categorical variables if (class(data[, sample_var]) == "character") { plot <- ggplot(data = as.data.frame(data), mapping = aes( x = !!rlang::sym(sample_var), y = gene_expr, color = !!rlang::sym(sample_var) )) + geom_boxplot(size = 0.25, width = 0.32, color = "black", outlier.color = "#FFFFFFFF") + geom_jitter(width = 0.15, alpha = 1, size = 1) + stat_smooth(geom = "line", alpha = 0.6, size = 0.4, span = 0.3, method = "lm", aes(group = 1), color = "orangered3") + scale_color_manual(values = colors[[sample_var]]) + theme_bw() + guides(color = "none") + labs( title = gene_id, subtitle = paste0("Variance explained: ", percentage, "%"), y = "lognorm counts", x = sample_var ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.title = element_text(hjust = 0.5, size = 7.5, face = "bold"), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } ## Scatterplots for continuous variables else { colors <- c( "mitoRate" = "khaki3", "overallMapRate" = "turquoise", "totalAssignedGene" = "plum2", "rRNA_rate" = "orange3", "sum" = "palegreen3", "detected" = 
"skyblue2", "ERCCsumLogErr" = "slateblue1" ) plot <- ggplot(as.data.frame(data), aes(x = eval(parse_expr(sample_var)), y = gene_expr)) + geom_point(color = colors[[sample_var]], size = 2) + stat_smooth(geom = "line", alpha = 0.4, size = 0.4, span = 0.25, method = "lm", color = "orangered3") + theme_bw() + guides(color = "none") + labs( title = gene_id, subtitle = paste0("Variance explained: ", percentage, "%"), y = "lognorm counts", x = gsub("_", " ", sample_var) ) + theme( plot.margin = unit(c(0.4, 0.1, 0.4, 0.1), "cm"), axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.title = element_text(hjust = 0.5, size = 7.5, face = "bold"), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } return(plot) } ## Function to plot gene expression vs sample variable data for top 3 most affected genes plot_gene_expr_sample <- function(sample_var) { ## Top 3 genes most affected by sample variable affected_genes <- rownames(varPart_data[order(varPart_data[, sample_var], decreasing = TRUE), ][1:3, ]) ## Plots plots <- list() for (i in 1:length(affected_genes)) { plots[[i]] <- plot_gene_expr(sample_var, affected_genes[i]) } plot_grid(plots[[1]], plots[[2]], plots[[3]], ncol = 3) } ## Plots for top affected genes by 'overallMapRate' plots <- plot_gene_expr_sample("overallMapRate") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'totalAssignedGene' plots <- plot_gene_expr_sample("totalAssignedGene") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'Group' plots <- plot_gene_expr_sample("Group") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'Sex' (genes in sexual chrs) plots <- plot_gene_expr_sample("Sex") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; 
margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 📝 Exercise 3: What % of variance does Group explain for the gene ENSMUSG00000042348.10? Create the boxplots for its counts in control and experimental samples. Is it more likely that the gene is up-regulated or down-regulated? 📝 Exercise 4: Do the same for the gene ENSMUSG00000064372.1. What do you observe in terms of variance percentage and sample differences? References Hoffman, G. E., & Schadt, E. E. (2016). variancePartition: interpreting drivers of variation in complex gene expression studies.BMC bioinformatics, 17(1), 1-13. Hoffman, G. (2022). variancePartition: Quantifying and interpreting drivers of variation in multilevel gene expression experiments. van den Berg, S. M. (2022). Analysing data using linear models. Web site: https://bookdown.org/pingapang9/linear_models_bookdown/ Simoiu, C. & Savage, J. (2016). A bag of tips and tricks for dealing with scale issues. Web site: https://rpubs.com/jimsavage/scale_issues "],["differential-gene-expression-exercise.html", "8 Differential gene expression exercise 8.1 Recap 8.2 Exercise", " 8 Differential gene expression exercise Instructor: Leo 8.1 Recap So far we know how to: choose a study from recount3 download data for a study with recount3::create_rse() explore the data interactively with iSEE expand Sequence Read Archive (SRA) attributes sometimes we need to clean them up a bit before we can use them use edgeR::calcNormFactors() to reduce composition bias We didn’t show it explicitly this time build a differential gene expression model with model.matrix() explore and interpret the model with ExploreModelMatrix use limma::voom() and related functions to compute the differential gene expression statistics extract the DEG statistics with limma::topTable(sort.by = \"none\") among several other plots and tools we learned along the way. 
Alternatively to recount3, we have learned about the RangedSummarizedExperiment objects produced by SPEAQeasy and in particular the one we are using on the smokingMouse project. You might have your own data already. Maybe you have it as an AnnData python object. If so, you can convert it to R with zellkonverter. 8.2 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise option 1: This will be an open ended exercise. Think of it as time to practice what we’ve learnt using data from recount3 or another subset of the smokingMouse dataset. You could also choose to re-run code from earlier parts of the course and ask clarifying questions. You could also use this time to adapt some of the code we’ve covered to use it with your own dataset. If you prefer a more structured exercise: Exercise option 2: Choose two recount3 studies that can be used to study similar research questions. For example, two studies with brain samples across age. Download and process each dataset independently, up to the point where you have differential expression t-statistics for both. Skip most of the exploratory data analyses steps as for the purpose of this exercise, we are most interested in the DEG t-statistics. If you don’t want to choose another recount3 study, you could use the smokingMouse data and subset it once to the pups in nicotine arm of the study and a second time for the pups in the smoking arm of the study. Or you could use the GTEx brain data from recount3, subset it to the prefrontal cortex (PFC), and compute age related expression changes. That would be in addition to SRA study SRP045638 as was showcased in the 2023 version of this course. recount3::create_rse_manual( project = "BRAIN", project_home = "data_sources/gtex", organism = "human", annotation = "gencode_v26", type = "gene" ) Make a scatterplot of the t-statistics between the two datasets to assess correlation / concordance. 
You might want to use GGally::ggpairs() for this https://ggobi.github.io/ggally/reference/ggpairs.html. Or ggpubr::ggscatter() https://rpkgs.datanovia.com/ggpubr/reference/ggscatter.html. For example, between the GTEx PFC data and the data from SRA study SRP045638 provided by recount3. Or between the nicotine-exposed pups and the smoking-exposed pups in smokingMouse. Or using the two recount3 studies you chose. Are there any DEGs FDR < 5% in both datasets? Or FDR < 5% in dataset 1 that have a p-value < 5% in the other one? You could choose to make a concordance at the top plot like at http://leekgroup.github.io/recount-analyses/example_de/recount_SRP019936.html, though you will likely need more time to complete this. "],["research-talks.html", "9 Research talks 9.1 Fentanyl rat study 9.2 Cg Hb cell projectors study 9.3 deconvolution-benchmark", " 9 Research talks 9.1 Fentanyl rat study Daianna 9.2 Cg Hb cell projectors study Melissa 9.3 deconvolution-benchmark Leonardo Here’s Louise A. Huuki-Myers LIBD seminar if you prefer to watch the recording. "],["biocthis-introduction.html", "10 biocthis introduction 10.1 Related past materials 10.2 biocthis main commands 10.3 Live demo 10.4 Community", " 10 biocthis introduction Instructor: Leo 10.1 Related past materials I’ve taught a lot about biocthis over the years. Here’s a 2020 video: and more recently, these are the LIBD rstats club 2023-03-10 notes. 10.2 biocthis main commands https://bioconductor.org/packages/biocthis pkgdown documentation website: https://lcolladotor.github.io/biocthis/ biocthis::use_bioc_pkg_templates() documentation: https://lcolladotor.github.io/biocthis/reference/use_bioc_pkg_templates.html These are the main steps you will need to know to make a Bioconductor package with biocthis: You first will need to create a package using a command from usethis. 
For example: usethis::create_package(\"~/Desktop/cshl2024pkg\") Now that you have a package, we can use biocthis to create 4 template R scripts that will guide you and help you make the full structure for a Bioconductor R package. On your new R package (cshl2024pkg), we can now use biocthis::use_bioc_pkg_templates(). In part these commands were born out of my own self interest to make it easier to make new packages instead of copy-pasting the contents of an older one, then manually adjusting all the pieces for a new package. See https://lcolladotor.github.io/pkgs/ for the list of all the R packages I’ve been involved in. 10.3 Live demo Here is the live demo result https://github.com/lcolladotor/cshl2024pkg/ with its companion documentation website at https://lcolladotor.github.io/cshl2024pkg/. You might also want to check the 2023 version at https://github.com/lcolladotor/cshl2024pkg/. Check the git commit history at https://github.com/lcolladotor/cshl2024pkg/commits/devel and the GitHub Actions history at https://github.com/lcolladotor/cshl2024pkg/actions. We can see at https://app.codecov.io/gh/lcolladotor/cshl2024pkg the code coverage results for this demonstration package. 10.3.1 Example function Let’s have a function to work with: weekday_praise(). weekday_praise <- function(date = Sys.Date()) { date <- as.Date(date) date_weekday <- weekdays(date) paste0(date_weekday, ": ", praise::praise()) } weekday_praise() #> [1] "Monday: You are exquisite!" weekday_praise("2024-06-09") #> [1] "Sunday: You are brilliant!" Here’s the full code for the function and its documentation. #' Praise a weekday #' #' Given a date, figure out which weekday it was, then write a positive #' message. #' #' @param date A `base::Date` object or a `character()` in a format that can be #' converted to a `base::Date` object with `base::as.Date()`. 
#' #' @importFrom praise praise #' @export #' @examples #' #' ## Praise the current weekday #' weekday_praise() #' #' ## Praise the date we started teaching #' weekday_praise("2024-06-09") #' #' ## Praise the current weekday in a reproducible way #' set.seed(20240610) #' weekday_praise() #' #' ## Verify that it's reproducible #' set.seed(20240610) #' weekday_praise() weekday_praise <- function(date = Sys.Date()) { date <- as.Date(date) date_weekday <- weekdays(date) paste0(date_weekday, ": ", praise::praise()) } Here’s a test for our function too. library("testthat") #> #> Attaching package: 'testthat' #> The following objects are masked from 'package:rlang': #> #> is_false, is_null, is_true #> The following object is masked from 'package:Hmisc': #> #> describe ## Verify that we get the result we wanted set.seed(20240610) expect_equal(weekday_praise("2024-06-09"), "Sunday: You are wondrous!") ## Verify that we get an error if the input is not correct expect_error(weekday_praise("240609")) ## Should work for a vector input expect_equal(length(weekday_praise(c("2024-06-09", "2024-06-10"))), 2L) 10.4 Community For more materials on R/Bioconductor package development check http://contributions.bioconductor.org/. I’m on a Friday night mood now enjoying @lmwebr’s #OSTA workshop 🔥, feeling grateful 🙏🏽 to everyone who nominated me for the #BioC2021 community award 🥇& celebrating 🍺 https://t.co/2oFLdGO3UhSee you in #BioC2022🤞🏽 @Bioconductor #rstats @CDSBMexico https://t.co/0SGHDfiRCs pic.twitter.com/UmM9nMP2W2 — 🇲🇽 Leonardo Collado-Torres (@lcolladotor) August 7, 2021 biocthis is one of the reasons for my 2021 Bioconductor community award :-) Do you want to play an active role? Join the cloud-working-group Slack channel. 
"],["scrna-seq-data-analysis-overview.html", "11 scRNA-seq data analysis overview 11.1 Single cell RNA sequencing 11.2 Basic Workflow 11.3 The SingleCellExperiment class 11.4 Quality Control 11.5 Normalization 11.6 Feature selection 11.7 Dimensionality reduction 11.8 Clustering 11.9 Marker gene detection 11.10 Cell type annotation", " 11 scRNA-seq data analysis overview Instructor: Melissa Mayén Quiroz Adapted from: OSCA: Basics of Single-Cell Analysis with Bioconductor 11.1 Single cell RNA sequencing Single-cell RNA sequencing (scRNA-seq) is a cutting-edge technology used to analyze the gene expression profiles of individual cells. Unlike traditional bulk RNA sequencing, which provides an average expression profile of a population of cells, scRNA-seq allows researchers to study the gene expression patterns of single cells. Cell heterogeneity Cell type identification Cell state dynamics Orchestrating Single-Cell Analysis with Bioconductor Authors: Robert Amezquita [aut], Aaron Lun [aut], Stephanie Hicks [aut], Raphael Gottardo [aut], Alan O’Callaghan [cre] 11.1.1 Pre-processing of scRNA-seq Data (Before R) Quality Control of the reads (FastQC): Assess the quality of raw sequencing reads. Check GC content, overrepresented sequences, presence of N bases, and other quality metrics. Alignment to Reference Transcriptome: Align sequencing reads to a reference transcriptome. Generate aligned read files. Generation of Expression Count Matrix: Quantify gene expression levels by counting the number of reads mapped to each gene. Create a matrix with genes as rows and cells as columns, where each entry represents the count of reads for a specific gene in a specific cell. For 10x Genomics data, the Cellranger software suite (Zheng et al. 2017) provides a custom pipeline to obtain a count matrix. This uses STAR to align reads to the reference genome and then counts the number of unique UMIs mapped to each gene. 
11.1.2 Different Technologies Droplet-based: 10x Genomics, inDrop, Drop-seq Plate-based with unique molecular identifiers (UMIs): CEL-seq, MARS-seq Plate-based with reads: Smart-seq2 Other: sci-RNA-seq, Seq-Well In practical terms, droplet-based technologies are the current de facto standard due to their throughput and low cost per cell. Plate-based methods can capture other phenotypic information (e.g., morphology) and are more amenable to customization. Read-based methods provide whole-transcript coverage, which is useful in some applications (e.g., splicing, exome mutations); otherwise, UMI-based methods are more popular as they mitigate the effects of PCR amplification noise. 11.2 Basic Workflow In the simplest case, the workflow has the following form: We compute quality control metrics to remove low-quality cells that would interfere with downstream analyses. These cells may have been damaged during processing or may not have been fully captured by the sequencing protocol. Common metrics includes the total counts per cell, the proportion of spike-in or mitochondrial reads and the number of detected features. We convert the counts into normalized expression values to eliminate cell-specific biases (e.g., in capture efficiency). This allows us to perform explicit comparisons across cells in downstream steps like clustering. We also apply a transformation, typically log, to adjust for the mean-variance relationship. We perform feature selection to pick a subset of interesting features for downstream analysis. This is done by modelling the variance across cells for each gene and retaining genes that are highly variable. The aim is to reduce computational overhead and noise from uninteresting genes. We apply dimensionality reduction to compact the data and further reduce noise. 
Principal components analysis is typically used to obtain an initial low-rank representation for more computational work, followed by more aggressive methods like t-stochastic neighbor embedding for visualization purposes. We cluster cells into groups according to similarities in their (normalized) expression profiles. This aims to obtain groupings that serve as empirical proxies for distinct biological states. We typically interpret these groupings by identifying differentially expressed marker genes between clusters. 11.3 The SingleCellExperiment class This object is specifically designed to store and analyze single-cell RNA sequencing (scRNA-seq) data. It extends the SummarizedExperiment class to include specialized features for single-cell data, such as cell identifiers, dimensionality reduction results, and methods for quality control and normalization. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent cells. colData (Column Metadata): Additional information about each cell, such as cell type, experimental condition, or any other relevant metadata. rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. reducedDims: Dimensionality reduction results, such as “principal component analysis” (PCA), “t-distributed stochastic neighbor embedding” (t-SNE), and “Uniform Manifold Approximation and Projection” (UMAP), used for visualizing and clustering cells. altExpNames and altExps: Names of alternative experiments (such as spike-in control genes used for normalization) and alternative experiment counts matrices. metadata: Additional metadata about the experiment. 11.3.1 Data Loading The Lun et al. (2017) dataset contains two 96-well plates of 416B cells (an immortalized mouse myeloid progenitor cell line), processed using the Smart-seq2 protocol (Picelli et al. 2014). 
A constant amount of spike-in RNA from the External RNA Controls Consortium (ERCC) was also added to each cell’s lysate prior to library preparation. library("scRNAseq") library("SingleCellExperiment") library("AnnotationHub") library("scater") ## Load the data set sce.416b <- LunSpikeInData(which = "416b") #> downloading 1 resources #> retrieving 1 resource #> loading from cache #> require("ensembldb") ## We convert the blocking factor to a factor so that downstream steps do not treat it as an integer. sce.416b$block <- factor(sce.416b$block) ## rename the rows with the symbols, reverting to Ensembl identifiers ens.mm.v97 <- AnnotationHub()[["AH73905"]] #> loading from cache rowData(sce.416b)$ENSEMBL <- rownames(sce.416b) rowData(sce.416b)$SYMBOL <- mapIds(ens.mm.v97, keys = rownames(sce.416b), keytype = "GENEID", column = "SYMBOL" ) #> Warning: Unable to map 563 of 46604 requested IDs. rowData(sce.416b)$SEQNAME <- mapIds(ens.mm.v97, keys = rownames(sce.416b), keytype = "GENEID", column = "SEQNAME" ) #> Warning: Unable to map 563 of 46604 requested IDs. rownames(sce.416b) <- uniquifyFeatureNames( rowData(sce.416b)$ENSEMBL, rowData(sce.416b)$SYMBOL ) 11.3.2 Basics of your SCE ## Look at your SCE sce.416b #> class: SingleCellExperiment #> dim: 46604 192 #> metadata(0): #> assays(1): counts #> rownames(46604): 4933401J01Rik Gm26206 ... CAAA01147332.1 CBFB-MYH11-mcherry #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(192): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 ... #> SLX-11312.N712_S508.H5H5YBBXX.s_8.r_1 SLX-11312.N712_S517.H5H5YBBXX.s_8.r_1 #> colData names(8): cell line cell type ... 
spike-in addition block #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV ## Get in the slot "assay", in the count matrix ## [genes, cells] assay(sce.416b, "counts")[110:113, 1:2] # gene, cell #> 4 x 2 sparse Matrix of class "dgCMatrix" #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> 1700034P13Rik . . #> Sgk3 8 . #> Gm6195 2 3 #> Gm22607 . . ## We can do it like this too counts(sce.416b)[110:113, 1:2] #> 4 x 2 sparse Matrix of class "dgCMatrix" #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> 1700034P13Rik . . #> Sgk3 8 . #> Gm6195 2 3 #> Gm22607 . . ## We could add more assays to our SCE sce.416b <- logNormCounts(sce.416b) sce.416b #> class: SingleCellExperiment #> dim: 46604 192 #> metadata(0): #> assays(2): counts logcounts #> rownames(46604): 4933401J01Rik Gm26206 ... CAAA01147332.1 CBFB-MYH11-mcherry #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(192): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 ... #> SLX-11312.N712_S508.H5H5YBBXX.s_8.r_1 SLX-11312.N712_S517.H5H5YBBXX.s_8.r_1 #> colData names(9): cell line cell type ... block sizeFactor #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV ## Acces to the column names (cell identifyers) head(colnames(sce.416b)) #> [1] "SLX-9555.N701_S502.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S503.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S504.C89V9ANXX.s_1.r_1" #> [4] "SLX-9555.N701_S505.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S506.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S507.C89V9ANXX.s_1.r_1" ## Acces to the column data (cell information) head(colData(sce.416b)) #> DataFrame with 6 rows and 9 columns #> cell line cell type single cell well quality genotype #> <character> <character> <character> <character> #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. 
#> SLX-9555.N701_S504.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> SLX-9555.N701_S505.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> phenotype strain spike-in addition block sizeFactor #> <character> <character> <character> <factor> <numeric> #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 0.742741 #> SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 0.923157 #> SLX-9555.N701_S504.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 1.012242 #> SLX-9555.N701_S505.C89V9ANXX.s_1.r_1 induced CBFB-MYH11 o.. B6D2F1-J ERCC+SIRV 20160113 1.151585 #> [ reached getOption("max.print") -- omitted 2 rows ] ## Acces to the row names (gene names) head(rownames(sce.416b)) #> [1] "4933401J01Rik" "Gm26206" "Xkr4" "Gm18956" "Gm37180" "Gm37363" ## Acces to the row data (gene information) head(rowData(sce.416b)) #> DataFrame with 6 rows and 4 columns #> Length ENSEMBL SYMBOL SEQNAME #> <integer> <character> <character> <character> #> 4933401J01Rik 1070 ENSMUSG00000102693 4933401J01Rik 1 #> Gm26206 110 ENSMUSG00000064842 Gm26206 1 #> Xkr4 6094 ENSMUSG00000051951 Xkr4 1 #> Gm18956 480 ENSMUSG00000102851 Gm18956 1 #> Gm37180 2819 ENSMUSG00000103377 Gm37180 1 #> Gm37363 2233 ENSMUSG00000104017 Gm37363 1 ## We can create another SCE subsetitng the first one sce_2 <- sce.416b[110:130, 1:2] sce_2 #> class: SingleCellExperiment #> dim: 21 2 #> metadata(0): #> assays(2): counts logcounts #> rownames(21): 1700034P13Rik Sgk3 ... Gm38005 Gm15604 #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(2): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> colData names(9): cell line cell type ... block sizeFactor #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV As in the SummarizedExperiment, $ is the operator used to access a specific column within the cell metadata. 
That is, it’s a shortcut for colData(obj)$. head(sce.416b$`cell type`) #> [1] "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" #> [6] "embryonic stem cell" Now, we will look at the dimension reductions ## This is empty reducedDimNames(sce_2) #> character(0) ## Compute PCA sce_2 <- runPCA(sce_2) #> Warning in check_numbers(k = k, nu = nu, nv = nv, limit = min(dim(x)) - : more singular values/vectors requested than #> available #> Warning in (function (A, nv = 5, nu = nv, maxit = 1000, work = nv + 7, reorth = TRUE, : You're computing too large a #> percentage of total singular values, use a standard svd instead. ## Check again reducedDimNames(sce_2) #> [1] "PCA" 11.4 Quality Control Low-quality libraries in scRNA-seq data can arise from a variety of sources such as cell damage during dissociation or failure in library preparation (e.g., inefficient reverse transcription or PCR amplification). These usually manifest as “cells” with low total counts, few expressed genes and high mitochondrial or spike-in proportions. These low-quality libraries are problematic as they can contribute to misleading results in downstream analyses. 11.4.1 Common choices of QC metrics For each cell, we calculate these QC metrics using the perCellQCMetrics() function from the scater package (McCarthy et al. 2017). The sum column contains the total count for each cell and the detected column contains the number of detected genes. The subsets_Mito_percent column contains the percentage of reads mapped to mitochondrial transcripts. Finally, the altexps_ERCC_percent column contains the percentage of reads mapped to ERCC transcripts. 
library("scuttle") ## Identify mitochondrial genes (those with SEQNAME equal to "MT") in the row data mito <- which(rowData(sce.416b)$SEQNAME == "MT") ## Compute per-cell QC metrics, including a subset for mitochondrial genes stats <- perCellQCMetrics(sce.416b, subsets = list(Mt = mito)) summary(stats$sum) # total library sizes for all cells #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 27084 856350 1111252 1165865 1328301 4398883 summary(stats$detected) # detected features (genes) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 5609 7502 8341 8397 9208 11380 summary(stats$subsets_Mt_percent) # percentage of reads mapping to mitochondrial genes #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 4.593 7.294 8.139 8.146 9.035 15.617 summary(stats$altexps_ERCC_percent) # percentage of reads mapping to spike-in controls #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 2.242 4.291 6.031 6.412 8.126 19.429 Alternatively, users may prefer to use the addPerCellQC() function. This computes and appends the per-cell QC statistics to the colData of the SingleCellExperiment object, allowing us to retain all relevant information in a single object for later manipulation. ## Compute addPerCellQCMetrics, including a subset for mitochondrial genes sce.416b <- addPerCellQCMetrics(sce.416b, subsets = list(Mito = mito)) colnames(colData(sce.416b)) #> [1] "cell line" "cell type" "single cell well quality" "genotype" #> [5] "phenotype" "strain" "spike-in addition" "block" #> [9] "sizeFactor" "sum" "detected" "subsets_Mito_sum" #> [13] "subsets_Mito_detected" "subsets_Mito_percent" "altexps_ERCC_sum" "altexps_ERCC_detected" #> [17] "altexps_ERCC_percent" "altexps_SIRV_sum" "altexps_SIRV_detected" "altexps_SIRV_percent" #> [21] "total" A key assumption here is that the QC metrics are independent of the biological state of each cell. 
Poor values (e.g., low library sizes, high mitochondrial proportions) are presumed to be driven by technical factors rather than biological processes, meaning that the subsequent removal of cells will not misrepresent the biology in downstream analyses. 11.4.2 Identifying low-quality cells 11.4.2.1 With fixed thresholds The simplest approach to identifying low-quality cells involves applying fixed thresholds to the QC metrics. For example, we might consider cells to be low quality if they have library sizes below 100,000 reads; express fewer than 5,000 genes; have spike-in proportions above 10%; or have mitochondrial proportions above 10%. ## Using our previous perCellQCMetrics data: ## Identify cells with a total library size (sum of counts) less than 100,000 c.lib <- stats$sum < 1e5 ## Identify cells with fewer than 5,000 detected features (genes) qc.nexprs <- stats$detected < 5e3 ## Identify cells with more than 10% of reads mapping to spike-in controls (e.g., ERCC) qc.spike <- stats$altexps_ERCC_percent > 10 ## Identify cells with more than 10% of reads mapping to mitochondrial genes qc.mito <- stats$subsets_Mt_percent > 10 ## Create a combined logical vector that marks cells to discard if they meet any of the above criteria discard <- c.lib | qc.nexprs | qc.spike | qc.mito ## Summarize the number of cells removed for each reason. 
DataFrame( LibSize = sum(c.lib), # Number of cells removed due to low library size NExprs = sum(qc.nexprs), # Number of cells removed due to low number of detected features SpikeProp = sum(qc.spike), # Number of cells removed due to high spike-in proportion MitoProp = sum(qc.mito), # Number of cells removed due to high mitochondrial proportion Total = sum(discard) # Total number of cells removed ) #> DataFrame with 1 row and 5 columns #> LibSize NExprs SpikeProp MitoProp Total #> <integer> <integer> <integer> <integer> <integer> #> 1 3 0 19 14 33 While simple, this strategy requires considerable experience to determine appropriate thresholds for each experimental protocol and biological system. Thresholds for read count-based data are not applicable for UMI-based data, and vice versa. Differences in mitochondrial activity or total RNA content require constant adjustment of the mitochondrial and spike-in thresholds, respectively, for different biological systems. Indeed, even with the same protocol and system, the appropriate threshold can vary from run to run due to the vagaries of cDNA capture efficiency and sequencing depth per cell. 11.4.2.2 With adaptive threshold Here, we assume that most of the dataset consists of high-quality cells. We then identify cells that are outliers for the various QC metrics, based on the median absolute deviation (MAD) from the median value of each metric across all cells. By default, we consider a value to be an outlier if it is more than 3 MADs from the median in the “problematic” direction. We can do that using the perCellQCFilters() function. It will allow us to identify cells with log-transformed library sizes that are more than 3 MADs below the median. A log-transformation is used to improve resolution at small values when type = \"lower\" and to avoid negative thresholds that would be meaningless for a non-negative metric. 
perCellQCFilters() will also identify outliers for the proportion-based metrics specified in the sub.fields= arguments. These distributions frequently exhibit a heavy right tail, but unlike the two previous metrics, it is the right tail itself that contains the putative low-quality cells. Thus, we do not perform any transformation to shrink the tail - rather, our hope is that the cells in the tail are identified as large outliers. A cell that is an outlier for any of these metrics is considered to be of low quality and discarded. This is captured in the discard column, which can be used for later filtering. ## Identify cells that are outliers reasons <- perCellQCFilters(stats, sub.fields = c("subsets_Mt_percent", "altexps_ERCC_percent") ) # No transformation colSums(as.matrix(reasons)) #> low_lib_size low_n_features high_subsets_Mt_percent high_altexps_ERCC_percent #> 4 0 2 1 #> discard #> 6 ## Extract the exact filter thresholds attr(reasons$low_lib_size, "thresholds") #> lower higher #> 434082.9 Inf attr(reasons$low_n_features, "thresholds") #> lower higher #> 5231.468 Inf With this strategy, the thresholds adapt to both the location and spread of the distribution of values for a given metric. This allows the QC procedure to adjust to changes in sequencing depth, cDNA capture efficiency, mitochondrial content, etc. without requiring any user intervention or prior experience. However, the underlying assumption of a high-quality majority may not always be appropriate. 11.4.3 Checking diagnostic plots It is good practice to inspect the distributions of QC metrics to identify possible problems. In the most ideal case, we would see normal distributions that would justify the 3 MAD threshold used in outlier detection. 
A large proportion of cells in another mode suggests that the QC metrics might be correlated with some biological state, potentially leading to the loss of distinct cell types during filtering; or that there were inconsistencies with library preparation for a subset of cells, a not-uncommon phenomenon in plate-based protocols. library("scater") ## Add the information to the SCE columns colData(sce.416b) <- cbind(colData(sce.416b), stats) sce.416b$block <- factor(sce.416b$block) sce.416b$phenotype <- ifelse(grepl("induced", sce.416b$phenotype), "induced", "wild type") sce.416b$discard <- reasons$discard ## Plot gridExtra::grid.arrange( ## Discard low total counts plotColData(sce.416b, x = "block", y = "sum", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + scale_y_log10() + ggtitle("Total count"), ## Discard low detected genes plotColData(sce.416b, x = "block", y = "detected", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + scale_y_log10() + ggtitle("Detected features"), ## Discard high mitochondrial percentage plotColData(sce.416b, x = "block", y = "subsets_Mito_percent", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + ggtitle("Mito percent"), ## Discard high ERCC percentage plotColData(sce.416b, x = "block", y = "altexps_ERCC_percent", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + ggtitle("ERCC percent"), ncol = 1 ) You can also create some plots via iSEE :) p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Optional:: Create at least 1 QC plot using iSEE. Clue: Use the Column Data Plot 1 panel library("iSEE") iSEE(sce.416b) 11.4.4 Removing low-quality cells Once low-quality cells have been identified, we can choose to either remove them or mark them. Removal is the most straightforward option and is achieved by subsetting the SingleCellExperiment by column. 
In this case, we use the previous low-quality calls to generate a subsetted SingleCellExperiment that we would use for downstream analyses. ## Keep the columns we DON'T want to discard. filtered <- sce.416b[, !reasons$discard] Another option is to simply mark the low-quality cells as such and retain them in the downstream analysis. 11.5 Normalization Systematic differences in sequencing coverage between libraries are often observed in single-cell RNA sequencing data which typically arise from technical differences in cDNA capture or PCR amplification efficiency across cells, attributable to the difficulty of achieving consistent library preparation. Normalization aims to remove these differences such that they do not interfere with comparisons of the expression profiles between cells. This will ensure that any observed heterogeneity or differential expression within the cell population is driven by biology and not technical biases. Let's now load another dataset and quickly review what we have learned. library("scRNAseq") library("scater") ## Load dataset sce.zeisel <- ZeiselBrainData() sce.zeisel <- aggregateAcrossFeatures(sce.zeisel, ids = sub("_loc[0-9]+$", "", rownames(sce.zeisel)) ) ## Compute perCellQCMetrics stats <- perCellQCMetrics(sce.zeisel, subsets = list( Mt = rowData(sce.zeisel)$featureType == "mito" )) ## Compute quickPerCellQC qc <- quickPerCellQC(stats, percent_subsets = c( "altexps_ERCC_percent", "subsets_Mt_percent" )) ## Discard low quality cells sce.zeisel <- sce.zeisel[, !qc$discard] Scaling normalization Scaling normalization is the simplest and most commonly used class of normalization strategies. This involves dividing all counts for each cell by a cell-specific scaling factor, often called a “size factor” (Anders and Huber 2010). The assumption here is that any cell-specific bias (e.g., in capture or amplification efficiency) affects all genes equally via scaling of the expected mean count for that cell. 
The size factor for each cell represents the estimate of the relative bias in that cell, so division of its counts by its size factor should remove that bias. 11.5.1 Library size normalization Library size normalization is the simplest strategy for performing scaling normalization. We define the library size as the total sum of counts across all genes for each cell, the expected value of which is assumed to scale with any cell-specific biases. The “library size factor” for each cell is then directly proportional to its library size where the proportionality constant is defined such that the mean size factor across all cells is equal to 1. This definition ensures that the normalized expression values are on the same scale as the original counts, which is useful for interpretation (especially when dealing with transformed data). library("scater") ## Compute librarySizeFactors lib.sf.zeisel <- librarySizeFactors(sce.zeisel) summary(lib.sf.zeisel) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.1757 0.5680 0.8680 1.0000 1.2783 4.0839 In the Zeisel brain data, the library size factors differ by up to 10-fold across cells. This is typical of the variability in coverage in scRNA-seq data. ## Plot the library size factors differences hist(log10(lib.sf.zeisel), xlab = "Log10[Size factor]", col = "grey80") Strictly speaking, the use of library size factors assumes that there is no “imbalance” in the differentially expressed (DE) genes between any pair of cells. Although, in practice, normalization accuracy is not a major consideration for exploratory scRNA-seq data analyses. Composition biases do not usually affect the separation of clusters, only the magnitude - and to a lesser extent, direction - of the log-fold changes between clusters or cell types 11.5.2 Normalization by deconvolution composition biases will be present when any unbalanced differential expression exists between samples. 
Consider the simple example of two cells where a single gene “X” is upregulated in one cell “A” compared to the other cell “B”. This upregulation means that either more sequencing resources are devoted to “X” in “A”, thus decreasing coverage of all other non-DE genes when the total library size of each cell is experimentally fixed; or the library size of “A” increases when “X” is assigned more reads or UMIs. The removal of composition biases is a well-studied problem for bulk RNA sequencing data analysis. - estimateSizeFactorsFromMatrix() function in the DESeq2 package (Anders and Huber 2010; Love, Huber, and Anders 2014) - calcNormFactors() function in the edgeR package (Robinson and Oshlack 2010). Single-cell data can be problematic for these bulk normalization methods due to the dominance of low and zero counts. To overcome this, we pool counts from many cells to increase the size of the counts for accurate size factor estimation (Lun, Bach, and Marioni 2016). Pool-based size factors are then “deconvolved” into cell-based factors for normalization of each cell’s expression profile. This is performed using the calculateSumFactors() function from scran. First we have a pre-clustering step with quickCluster() where cells in each cluster are normalized separately and the size factors are rescaled to be comparable across clusters. This avoids the assumption that most genes are non-DE across the entire population - only a non-DE majority is required between pairs of clusters, which is a weaker assumption for highly heterogeneous populations. library("scran") ## Compute quickCluster + calculateSumFactor for deconvolution normalization set.seed(100) clust.zeisel <- quickCluster(sce.zeisel) table(clust.zeisel) #> clust.zeisel #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 #> 170 254 441 178 393 148 219 240 189 123 112 103 135 111 deconv.sf.zeisel <- calculateSumFactors(sce.zeisel, clusters = clust.zeisel) summary(deconv.sf.zeisel) #> Min. 1st Qu. Median Mean 3rd Qu. Max. 
#> 0.1186 0.4860 0.8314 1.0000 1.3209 4.5090 11.5.3 Normalization by spike-ins Spike-in normalization is based on the assumption that the same amount of spike-in RNA was added to each cell, so, systematic differences in the coverage of the spike-in transcripts can only be due to cell-specific biases, e.g., in capture efficiency or sequencing depth. To remove these biases, we equalize spike-in coverage across cells by scaling with “spike-in size factors”. Compared to the previous methods, spike-in normalization requires no assumption about the biology of the system. Practically, spike-in normalization should be used if differences in the total RNA content of individual cells are of interest and must be preserved in downstream analyses. To demonstrate the use of spike-in normalization on a different dataset involving T cell activation after stimulation with T cell receptor ligands of varying affinity (Richard et al. 2018). library("scRNAseq") sce.richard <- RichardTCellData() #> loading from cache sce.richard <- sce.richard[, sce.richard$`single cell quality` == "OK"] sce.richard #> class: SingleCellExperiment #> dim: 46603 528 #> metadata(0): #> assays(1): counts #> rownames(46603): ENSMUSG00000102693 ENSMUSG00000064842 ... ENSMUSG00000096730 ENSMUSG00000095742 #> rowData names(0): #> colnames(528): SLX-12611.N701_S502. SLX-12611.N702_S502. ... SLX-12612.i712_i522. SLX-12612.i714_i522. #> colData names(13): age individual ... stimulus time #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(1): ERCC We apply the computeSpikeFactors() method to estimate spike-in size factors for all cells. This is defined by converting the total spike-in count per cell into a size factor, using the same reasoning as in librarySizeFactors(). (Scaling will subsequently remove any differences in spike-in coverage across cells). 
## computeSpikeFactors() to estimate spike-in size factors sce.richard <- computeSpikeFactors(sce.richard, "ERCC") summary(sizeFactors(sce.richard)) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.1247 0.4282 0.6274 1.0000 1.0699 23.3161 11.5.4 Scaling and log-transforming Once we have computed the size factors, we use the logNormCounts() function from scater to compute normalized expression values for each cell. This is done by dividing the count for each gene/spike-in transcript with the appropriate size factor for that cell. The function also log-transforms the normalized values, creating a new assay called \"logcounts\". (Technically, these are “log-transformed normalized expression values”). ## Compute normalized expression values and log-transformation sce.zeisel <- logNormCounts(sce.zeisel) assayNames(sce.zeisel) #> [1] "counts" "logcounts" The log-transformation is useful as differences in the log-values represent log-fold changes in expression. By operating on log-transformed data, we ensure that these procedures are measuring distances between cells based on log-fold changes in expression. Log-transformation focuses on the former by promoting contributions from genes with strong relative differences. 11.6 Feature selection highly variable genes (HVGs) We often use scRNA-seq data in exploratory analyses to characterize heterogeneity across cells. Procedures like clustering and dimensionality reduction compare cells based on their gene expression profiles, which involves aggregating per-gene differences into a single (dis)similarity metric between a pair of cells. The choice of genes to use in this calculation has a major impact on the behavior of the metric and the performance of downstream methods. We want to select genes that contain useful information about the biology of the system while removing genes that contain random noise. 
This aims to preserve interesting biological structure without the variance that obscures that structure, and to reduce the size of the data to improve computational efficiency of later steps. The simplest approach to feature selection is to select the most variable genes based on their expression across the population. This assumes that genuine biological differences will manifest as increased variation in the affected genes, compared to other genes that are only affected by technical noise or a baseline level of “uninteresting” biological variation. 11.6.1 Quantifying per-gene variation The simplest approach to quantifying per-gene variation is to compute the variance of the log-normalized expression values (“log-counts”) for each gene across all cells (A. T. L. Lun, McCarthy, and Marioni 2016). The advantage of this approach is that the feature selection is based on the same log-values that are used for later downstream steps. In particular, genes with the largest variances in log-values will contribute most to the Euclidean distances between cells during procedures like clustering and dimensionality reduction. By using log-values here, we ensure that our quantitative definition of heterogeneity is consistent throughout the entire analysis. Calculation of the per-gene variance is simple, but feature selection requires modelling of the mean-variance relationship. The log-transformation is not a variance stabilizing transformation in most cases, which means that the total variance of a gene is driven more by its abundance than its underlying biological heterogeneity. To account for this effect, we use the modelGeneVar() function to fit a trend to the variance with respect to abundance across all genes (Figure 3.1). 
library("scran") ## Model the mean-variance relationship dec.zeisel <- modelGeneVar(sce.zeisel) ## Plot the fit fit.zeisel <- metadata(dec.zeisel) plot(fit.zeisel$mean, fit.zeisel$var, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curve(fit.zeisel$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) At any given abundance, we assume that the variation in expression for most genes is driven by uninteresting processes like sampling noise. Under this assumption, the fitted value of the trend at any given gene’s abundance represents an estimate of its uninteresting variation, which we call the technical component. We then define the biological component for each gene as the difference between its total variance and the technical component. This biological component represents the “interesting” variation for each gene and can be used as the metric for HVG selection. ## Order by most interesting genes for inspection dec.zeisel[order(dec.zeisel$bio, decreasing = TRUE), ] #> DataFrame with 19839 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Plp1 3.86637 15.44422 1.52686 13.91736 0.00000e+00 0.00000e+00 #> Trf 2.27790 9.95350 1.42611 8.52739 7.65644e-200 7.59060e-196 #> Mal 2.30761 9.19548 1.42963 7.76586 2.43043e-165 1.60635e-161 #> Apod 1.89630 7.78119 1.36339 6.41780 9.80407e-125 4.85988e-121 #> Mog 1.84701 7.30188 1.35204 5.94985 2.65362e-109 8.76934e-106 #> ... ... ... ... ... ... ... #> Ddx5 3.71905 0.762844 1.54756 -0.784711 0.994702 0.997822 #> [ reached getOption("max.print") -- omitted 4 rows ] 11.6.2 Quantifying technical noise (spike-ins) The assumptions made by quantifying per-gene variation may be problematic in rare scenarios where many genes at a particular abundance are affected by a biological process. For example, strong upregulation of cell type-specific genes may result in an enrichment of HVGs at high abundances. 
This would inflate the fitted trend in that abundance interval and compromise the detection of the relevant genes. We can avoid this problem by fitting a mean-dependent trend to the variance of the spike-in transcripts, if they are available. The premise here is that spike-ins should not be affected by biological variation, so the fitted value of the spike-in trend should represent a better estimate of the technical component for each gene. ## Fit a mean-dependent trend to the variance of the spike-in transcripts dec.spike.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC") ## Order by most interesting genes for inspection dec.spike.416b[order(dec.spike.416b$bio, decreasing = TRUE), ] #> DataFrame with 46604 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Lyz2 6.53871 13.5804 1.61389 11.9665 3.42011e-189 2.91162e-186 #> Top2a 5.78145 14.2104 2.56868 11.6418 4.26367e-72 1.20992e-69 #> Ccnb2 5.89731 13.6178 2.39162 11.2262 3.53970e-77 1.09579e-74 #> Ccl9 6.70039 12.4793 1.44789 11.0314 1.27204e-199 1.26341e-196 #> Hbb-bt 4.95051 15.0336 4.02525 11.0083 1.50804e-27 1.03594e-25 #> ... ... ... ... ... ... ... #> Rpl5-ps2 3.50523 0.831793 6.43592 -5.60413 0.999712 0.999857 #> [ reached getOption("max.print") -- omitted 4 rows ] ## Plot the fit plot(dec.spike.416b$mean, dec.spike.416b$total, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) fit.spike.416b <- metadata(dec.spike.416b) points(fit.spike.416b$mean, fit.spike.416b$var, col = "red", pch = 16) curve(fit.spike.416b$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) 11.6.3 Quantifying technical noise (mean-variance trend) In the absence of spike-in data, one can attempt to create a trend by making some distributional assumptions about the noise. For example, UMI counts typically exhibit near-Poisson variation if we only consider technical noise from library preparation and sequencing. 
This can be used to construct a mean-variance trend in the log-counts with the modelGeneVarByPoisson() function. ## construct a mean-variance trend in the log-counts set.seed(0010101) dec.pois.zeisel <- modelGeneVarByPoisson(sce.zeisel) ## Order by most interesting genes for inspection dec.pois.zeisel <- dec.pois.zeisel[order(dec.pois.zeisel$bio, decreasing = TRUE), ] head(dec.pois.zeisel) #> DataFrame with 6 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Plp1 3.86637 15.44422 0.226375 15.21785 0 0 #> Trf 2.27790 9.95350 0.635655 9.31785 0 0 #> Mal 2.30761 9.19548 0.626251 8.56923 0 0 #> Apod 1.89630 7.78119 0.748055 7.03313 0 0 #> Mog 1.84701 7.30188 0.760426 6.54146 0 0 #> Mbp 2.20638 6.74997 0.658112 6.09186 0 0 ## Plot the fit plot(dec.pois.zeisel$mean, dec.pois.zeisel$total, pch = 16, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curve(metadata(dec.pois.zeisel)$trend(x), col = "dodgerblue", add = TRUE) Trends based purely on technical noise tend to yield large biological components for highly-expressed genes. This often includes so-called “house-keeping” genes coding for essential cellular components such as ribosomal proteins, which are considered uninteresting for characterizing cellular heterogeneity. These observations suggest that a more accurate noise model does not necessarily yield a better ranking of HVGs. Though, one should keep an open mind that house-keeping genes are regularly DE in a variety of conditions 11.6.4 Handling batch effects Data containing multiple batches will often exhibit batch effects. We are usually not interested in HVGs that are driven by batch effects; instead, we want to focus on genes that are highly variable within each batch. This is naturally achieved by performing trend fitting and variance decomposition separately for each batch. 
We will try now this approach by treating each plate (block) in the 416B dataset as a different batch, using the modelGeneVarWithSpikes() function. (The same argument is available in all other variance-modelling functions.) ## Fit a mean-dependent trend to the variance of the spike-in transcripts ## Independently for each batch (block) dec.block.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", block = sce.416b$block) # block=sce.416b$block head(dec.block.416b[order(dec.block.416b$bio, decreasing = TRUE), 1:6]) #> DataFrame with 6 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Lyz2 6.53871 13.5779 1.63259 11.9453 0.00000e+00 0.00000e+00 #> Top2a 5.78145 13.9852 2.77254 11.2126 6.95319e-238 2.07179e-235 #> Ccl9 6.70039 12.5449 1.40416 11.1408 0.00000e+00 0.00000e+00 #> Hbb-bt 4.95051 15.0506 4.07362 10.9770 7.34341e-90 5.66488e-88 #> Ccnb2 5.89731 13.3673 2.60730 10.7600 1.02751e-282 3.88775e-280 #> Cd200r3 4.81056 14.9435 4.31950 10.6240 8.48013e-76 5.20981e-74 ## Plot the fit by batch (block) par(mfrow = c(1, 2)) blocked.stats <- dec.block.416b$per.block for (i in colnames(blocked.stats)) { current <- blocked.stats[[i]] plot(current$mean, current$total, main = i, pch = 16, cex = 0.5, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curfit <- metadata(current) points(curfit$mean, curfit$var, col = "red", pch = 16) curve(curfit$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) } The use of a batch-specific trend fit is useful as it accommodates differences in the mean-variance trends between batches. This is especially important if batches exhibit systematic technical differences, e.g., differences in coverage or in the amount of spike-in RNA added. 11.6.5 Selecting highly variable genes Once we have quantified the per-gene variation, the next step is to select the subset of HVGs to use in downstream analyses. 
A larger subset will reduce the risk of discarding interesting biological signal by retaining more potentially relevant genes, at the cost of increasing noise from irrelevant genes that might obscure said signal. It is difficult to determine the optimal trade-off for any given application as noise in one context may be useful signal in another. The most obvious selection strategy is to take the top “n” genes with the largest values for the relevant variance metric. The main advantage of this approach is that the user can directly control the number of genes retained, which ensures that the computational complexity of downstream calculations is easily predicted. For modelGeneVar() and modelGeneVarWithSpikes(), we would select the genes with the largest biological components. This is conveniently done for us via getTopHVGs(), as shown with n = 1000. ## Top 1000 genes hvg.zeisel.var <- getTopHVGs(dec.zeisel, n = 1000) str(hvg.zeisel.var) #> chr [1:1000] "Plp1" "Trf" "Mal" "Apod" "Mog" "Mbp" "Car2" "Cnp" "Ugt8a" "Enpp2" "Meg3" "Mobp" "Ermn" "Ptgds" ... The choice of “n” also has a fairly straightforward biological interpretation. The main disadvantage of this approach is that it turns HVG selection into a competition between genes, whereby a subset of very highly variable genes can push other informative genes out of the top set. This can be problematic for analyses of highly heterogeneous populations if the loss of important markers prevents the resolution of certain subpopulations. 11.7 Dimensionality reduction Many scRNA-seq analysis procedures involve comparing cells based on their expression values across multiple genes. For example, clustering aims to identify cells with similar transcriptomic profiles by computing Euclidean distances across genes. In these applications, each individual gene represents a dimension of the data. As the name suggests, dimensionality reduction aims to reduce the number of separate dimensions in the data. 
This is possible because different genes are correlated if they are affected by the same biological process. Thus, we do not need to store separate information for individual genes, but can instead compress multiple features into a single dimension, e.g., an “eigengene” (Langfelder and Horvath 2007). This reduces computational work in downstream analyses like clustering, as calculations only need to be performed for a few dimensions rather than thousands of genes; reduces noise by averaging across multiple genes to obtain a more precise representation of the patterns in the data; and enables effective plotting of the data, for those of us who are not capable of visualizing more than 3 dimensions. 11.7.1 Principal components analysis Principal components analysis (PCA) discovers axes in high-dimensional space that capture the largest amount of variation. This is best understood by imagining each axis as a line. Say we draw a line anywhere, and we move each cell in our data set onto the closest position on the line. The variance captured by this axis is defined as the variance in the positions of cells along that line. In PCA, the first axis (or “principal component”, PC) is chosen such that it maximizes this variance. The next PC is chosen such that it is orthogonal to the first and captures the greatest remaining amount of variation, and so on. By definition, the top PCs capture the dominant factors of heterogeneity in the data set. In the context of scRNA-seq, our assumption is that biological processes affect multiple genes in a coordinated manner. This means that the earlier PCs are likely to represent biological structure as more variation can be captured by considering the correlated behavior of many genes. By comparison, random technical or biological noise is expected to affect each gene independently. There is unlikely to be an axis that can capture random variation across many genes, meaning that noise should mostly be concentrated in the later PCs. 
This motivates the use of the earlier PCs in our downstream analyses, which concentrates the biological signal to simultaneously reduce computational work and remove noise. We can perform PCA on the log-normalized expression values using the fixedPCA() function from scran. By default, fixedPCA() will compute the first 50 PCs and store them in the reducedDims() of the output SingleCellExperiment object, as shown below. Here, we use only the top 2000 genes with the largest biological components to reduce both computational work and high-dimensional random noise. In particular, while PCA is robust to random noise, an excess of it may cause the earlier PCs to capture noise instead of biological structure (Johnstone and Lu 2009). library("scran") ## Top 2000 HVGs top.zeisel <- getTopHVGs(dec.zeisel, n = 2000) ## Principal component analysis using top 2000 HVGs, 50 PCs set.seed(100) sce.zeisel <- fixedPCA(sce.zeisel, subset.row = top.zeisel) reducedDimNames(sce.zeisel) #> [1] "PCA" 11.7.2 Choosing the number of PCs How many of the top PCs should we retain for downstream analyses? The choice of the number of PCs is an analogous decision to the choice of the number of HVGs to use. Using more PCs will retain more biological signal at the cost of including more noise that might mask said signal. On the other hand, using fewer PCs will introduce competition between different factors of variation, where weaker (but still interesting) factors may be pushed down into lower PCs and inadvertently discarded from downstream analyses. It is hard to determine whether an “optimal” choice exists for the number of PCs. Certainly, we could attempt to remove the technical variation that is almost always uninteresting. However, even if we were only left with biological variation, there is no straightforward way to automatically determine which aspects of this variation are relevant. Most practitioners will simply set it to a “reasonable” but arbitrary value, typically ranging from 10 to 50. 
This is satisfactory depending on the amount of variance explained by that certain number of PCs. ## Variance explained by PCs percent.var <- attr(reducedDim(sce.zeisel), "percentVar") plot(percent.var, log = "y", xlab = "PC", ylab = "Variance explained (%)") 11.7.3 Visualizing the PCs Algorithms are more than happy to operate on 10-50 PCs, but these are still too many dimensions for human comprehension. To visualize the data, the most common and easy way is to use the top 2 PCs for plotting. library("scater") ## Plot PCA (Top 2 PCs for 2-dimensional visualization) plotReducedDim(sce.zeisel, dimred = "PCA", colour_by = "level1class") The problem is that PCA is a linear technique, i.e., only variation along a line in high-dimensional space is captured by each PC. As such, it cannot efficiently pack differences in d dimensions into the first 2 PCs. One workaround is to plot several of the top PCs against each other in pairwise plots. However, it is difficult to interpret multiple plots simultaneously, and even this approach is not sufficient to separate some of the annotated subpopulations. ## plot top 4 PCs against each other in pairwise plots plotReducedDim(sce.zeisel, dimred = "PCA", ncomponents = 4, colour_by = "level1class") 11.7.4 Non-linear methods for visualization 11.7.4.1 t-stochastic neighbor embedding The de facto standard for visualization of scRNA-seq data is the t-stochastic neighbor embedding (TSNE) method (Van der Maaten and Hinton 2008). This attempts to find a low-dimensional representation of the data that preserves the distances between each point and its neighbors in the high-dimensional space. Unlike PCA, it is not restricted to linear transformations, nor is it obliged to accurately represent distances between distant populations. 
This means that it has much more freedom in how it arranges cells in low-dimensional space, enabling it to separate many distinct clusters in a complex population. ## TSNE using runTSNE() stores the t-SNE coordinates in the reducedDims set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA") ## Plot TSNE plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class") The “perplexity” is another important parameter that determines the granularity of the visualization. Low perplexities will favor resolution of finer structure, possibly to the point that the visualization is compromised by random noise. Thus, it is advisable to test different perplexity values to ensure that the choice of perplexity does not drive the interpretation of the plot. ## run TSNE using different perplexity numbers and plot ## TSNE using perplexity = 5 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 5) out5 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 5") ## TSNE using perplexity = 20 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 20) out20 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 20") ## TSNE using perplexity = 80 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 80) out80 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 80") ## Combine plots gridExtra::grid.arrange(out5, out20, out80, ncol = 3) 11.7.4.2 Uniform manifold approximation and projection The uniform manifold approximation and projection (UMAP) method (McInnes, Healy, and Melville 2018) is an alternative to TSNE for non-linear dimensionality reduction. It is roughly similar to tSNE in that it also tries to find a low-dimensional representation that preserves relationships between neighbors in high-dimensional space. 
However, the two methods are based on different theory, represented by differences in the various graph weighting equations. This manifests as a different visualization. ## UMAP using runUMAP() stores the coordinates in the reducedDims set.seed(100) sce.zeisel <- runUMAP(sce.zeisel, dimred = "PCA") ## Plot UMAP plotReducedDim(sce.zeisel, dimred = "UMAP", colour_by = "level1class") Compared to tSNE, the UMAP visualization tends to have more compact visual clusters with more empty space between them. It also attempts to preserve more of the global structure than tSNE. From a practical perspective, UMAP is much faster than tSNE, which may be an important consideration for large datasets. UMAP also involves a series of randomization steps so setting the seed is critical. It is arguable whether the UMAP or tSNE visualizations are more useful or aesthetically pleasing. UMAP aims to preserve more global structure but this necessarily reduces resolution within each visual cluster. However, UMAP is unarguably much faster, and for that reason alone, it is increasingly displacing TSNE as the method of choice for visualizing large scRNA-seq data sets. 11.8 Clustering Clustering is an unsupervised learning procedure that is used to empirically define groups of cells with similar expression profiles. Its primary purpose is to summarize complex scRNA-seq data into a digestible format for human interpretation. This allows us to describe population heterogeneity in terms of discrete labels that are easily understood, rather than attempting to comprehend the high-dimensional manifold on which the cells truly reside. After annotation based on marker genes, the clusters can be treated as proxies for more abstract biological concepts such as cell types or cell states. At this point, it is helpful to realize that clustering, like a microscope, is simply a tool to explore the data. 
We can zoom in and out by changing the resolution of the clustering parameters, and we can experiment with different clustering algorithms to obtain alternative perspectives of the data. This iterative approach is entirely permissible given that data exploration constitutes the majority of the scRNA-seq data analysis workflow. As such, questions about the “correctness” of the clusters or the “true” number of clusters are usually meaningless. We can define as many clusters as we like, with whatever algorithm we like. Each clustering will represent its own partitioning of the high-dimensional expression space, and is as “real” as any other clustering. A more relevant question is “how well do the clusters approximate the cell types or states of interest?” Unfortunately, this is difficult to answer given the context-dependent interpretation of the underlying biology. Some analysts will be satisfied with resolution of the major cell types; other analysts may want resolution of subtypes; and others still may require resolution of different states (e.g., metabolic activity, stress) within those subtypes. Regardless of the exact method used, clustering is a critical step for extracting biological insights from scRNA-seq data. 11.8.1 Graph-based clustering Graph-based clustering is a flexible and scalable technique for clustering large scRNA-seq datasets. We first build a graph where each node is a cell that is connected to its nearest neighbors in the high-dimensional space. Edges are weighted based on the similarity between the cells involved, with higher weight given to cells that are more closely related. We then apply algorithms to identify “communities” of cells that are more connected to cells in the same community than they are to cells of different communities. Each community represents a cluster that we can use for downstream interpretation. The major advantage of graph-based clustering lies in its scalability. 
It only requires a k-nearest neighbor search that can be done in log-linear time on average, in contrast to hierarchical clustering methods with runtimes that are quadratic with respect to the number of cells. Graph construction avoids making strong assumptions about the shape of the clusters or the distribution of cells within each cluster, compared to other methods like k-means (that favor spherical clusters) or Gaussian mixture models (that require normality). The main drawback of graph-based methods is that, after graph construction, no information is retained about relationships beyond the neighboring cells. To demonstrate, we use the clusterCells() function in scran on the Zeisel dataset. All calculations are performed using the top PCs to take advantage of data compression and denoising. This function returns a vector containing cluster assignments for each cell in our SingleCellExperiment object. By default, clusterCells() uses the 10 nearest neighbors of each cell to construct a shared nearest neighbor graph. Two cells are connected by an edge if any of their nearest neighbors are shared, with the edge weight defined from the highest average rank of the shared neighbors (Xu and Su 2015). The Walktrap method from the igraph package is then used to identify communities. library("scran") ## Cluster using "scran::clusterCells" nn.clusters <- clusterCells(sce.zeisel, use.dimred = "PCA") ## Cluster assignments table(nn.clusters) #> nn.clusters #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 #> 561 136 78 159 123 65 112 349 368 105 95 200 92 44 67 58 37 28 33 28 48 30 We assign the cluster assignments back into our SingleCellExperiment object as a factor in the column metadata. 
This allows us to conveniently visualize the distribution of clusters in a tSNE plot: ## Save the cluster assignments colLabels(sce.zeisel) <- nn.clusters ## Plot TSNE coloured by cluster assignments plotReducedDim(sce.zeisel, "TSNE", colour_by = "label") If we want to explicitly specify all of these parameters, we would use the more verbose call below. This uses a SNNGraphParam object from the bluster package to instruct clusterCells() to detect communities from a shared nearest-neighbor graph with the specified parameters. The appeal of this interface is that it allows us to easily switch to a different clustering algorithm by simply changing the BLUSPARAM argument. library(bluster) ## Clustering using k=10 nn.clusters2 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = SNNGraphParam(k = 10, type = "rank", cluster.fun = "walktrap") ) table(nn.clusters2) #> nn.clusters2 #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 #> 561 136 78 159 123 65 112 349 368 105 95 200 92 44 67 58 37 28 33 28 48 30 We could also obtain the graph itself by specifying full=TRUE in the clusterCells() call. Doing so will return all intermediate structures that are used during clustering, including a graph object from the igraph package. ## Obtain the graph nn.clust.info <- clusterCells(sce.zeisel, use.dimred = "PCA", full = TRUE) head(nn.clust.info$objects$graph) #> 6 x 2816 sparse Matrix of class "dgCMatrix" #> #> [1,] . 8.5 9.5 9.5 9 8.5 8 5.5 8 6 4 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ...... #> #> .............................. #> ........suppressing 2762 columns and 4 rows in show(); maybe adjust options(max.print=, width=) #> .............................. #> #> [6,] 8.5 8 5.5 9 9 . 8.5 9 8 6 5 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ...... 
11.8.2 Adjusting the parameters A graph-based clustering method has several key parameters: How many neighbors are considered when constructing the graph. What scheme is used to weight the edges. Which community detection algorithm is used to define the clusters. K Neighbors One of the most important parameters is k, the number of nearest neighbors used to construct the graph. This controls the resolution of the clustering where higher k yields a more inter-connected graph and broader clusters. Users can exploit this by experimenting with different values of k to obtain a satisfactory resolution. ## More resolved clustering using a smaller k (k=5) clust.5 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(k = 5)) table(clust.5) #> clust.5 #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 #> 118 98 336 85 36 115 49 85 429 295 40 37 97 56 45 43 77 159 28 40 52 33 24 51 28 89 36 51 65 8 #> 31 32 33 34 35 36 37 #> 15 17 20 14 9 9 27 ## Less resolved clustering using a larger k (k=50) clust.50 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(k = 50)) table(clust.50) #> clust.50 #> 1 2 3 4 5 #> 362 812 945 288 409 Edge weighting scheme Further tweaking can be performed by changing the edge weighting scheme during graph construction. Setting type = \"number\" will weight edges based on the number of nearest neighbors that are shared between two cells. Similarly, type = \"jaccard\" will weight edges according to the Jaccard index of the two sets of neighbors. We can also disable weighting altogether by using a simple k-nearest neighbor graph, which is occasionally useful for downstream graph operations that do not support weights. 
## Cluster using the number of shared nearest neighbors (type="number") clust.num <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(type = "number") ) table(clust.num) #> clust.num #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 #> 128 161 129 457 128 116 78 309 397 205 60 96 70 62 35 13 46 51 30 31 52 28 15 58 34 27 ## Cluster using the Jaccard index (similarity between sample sets) clust.jaccard <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(type = "jaccard") ) table(clust.jaccard) #> clust.jaccard #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 #> 131 166 195 129 294 128 113 77 332 200 375 61 97 71 84 32 13 46 53 30 52 28 31 36 15 27 ## Cluster without specifying a graph type (default method-KNNGraphParam) clust.none <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = KNNGraphParam() ) table(clust.none) #> clust.none #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 #> 77 454 297 132 105 164 129 104 62 533 186 45 105 33 69 82 50 52 31 34 30 15 27 Community detection The community detection can be performed by using any of the algorithms provided by igraph. 
The Walktrap approach is a common one, but many others are available to choose from: clust.walktrap <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "walktrap") ) clust.louvain <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "louvain") ) clust.infomap <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "infomap") ) clust.fast <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "fast_greedy") ) clust.labprop <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "label_prop") ) clust.eigen <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "leading_eigen") ) 11.8.3 Hierarchical clustering Hierarchical clustering is an old technique that arranges samples into a hierarchy based on their relative similarity to each other. Most implementations do so by joining the most similar samples into a new cluster, then joining similar clusters into larger clusters, and so on, until all samples belong to a single cluster. This process yields a dendrogram that defines clusters with progressively increasing granularity. Variants of hierarchical clustering methods primarily differ in how they choose to perform the agglomerations. For example, complete linkage aims to merge clusters with the smallest maximum distance between their elements, while Ward’s method aims to minimize the increase in within-cluster variance. In the context of scRNA-seq, the main advantage of hierarchical clustering lies in the production of the dendrogram. This is a rich summary that quantitatively captures the relationships between subpopulations at various resolutions. This can be helpful for interpretation. In practice, hierarchical clustering is too slow to be used for anything but the smallest scRNA-seq datasets. 
Most implementations require a cell-cell distance matrix that is prohibitively expensive to compute for a large number of cells. Greedy agglomeration is also likely to result in a quantitatively suboptimal partitioning (as defined by the agglomeration measure) at higher levels of the dendrogram when the number of cells and merge steps is high. We use a HclustParam object to instruct clusterCells() to perform hierarchical clustering on the top PCs. Specifically, it computes a cell-cell distance matrix using the top PCs and then applies Ward’s minimum variance method to obtain a dendrogram. For this case, we will use the sce.416b dataset. library("scran") ## Top 2000 HVGs top.416b <- getTopHVGs(sce.416b, n = 2000) ## Principal component analysis using top 2000 HVGs, 50 PCs set.seed(100) sce.416b <- fixedPCA(sce.416b, subset.row = top.416b) ## TSNE sce.416b <- runTSNE(sce.416b, dimred = "PCA") library("dendextend") #> #> --------------------- #> Welcome to dendextend version 1.17.1 #> Type citation('dendextend') for how to cite the package. #> #> Type browseVignettes(package = 'dendextend') for the package vignette. #> The github page is: https://github.com/talgalili/dendextend/ #> #> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues #> You may ask questions at stackoverflow, use the r and dendextend tags: #> https://stackoverflow.com/questions/tagged/dendextend #> #> To suppress this message use: suppressPackageStartupMessages(library(dendextend)) #> --------------------- #> #> Attaching package: 'dendextend' #> The following object is masked from 'package:stats': #> #> cutree ## Perform hierarchical clustering on the PCA-reduced data from sce.416b ## The BLUSPARAM argument specifies the clustering method (here "ward.D2"). ## The full=TRUE argument ensures that additional objects related to clustering are returned. 
hclust.416b <- clusterCells(sce.416b, use.dimred = "PCA", BLUSPARAM = HclustParam(method = "ward.D2"), full = TRUE ) ## Extract the hierarchical clustering tree from the clustering result tree.416b <- hclust.416b$objects$hclust ## Customize the dendrogram for better visualization tree.416b$labels <- seq_along(tree.416b$labels) ## Convert the hierarchical clustering tree to a dendrogram object dend <- as.dendrogram(tree.416b, hang = 0.1) combined.fac <- paste0( sce.416b$block, ".", sub(" .*", "", sce.416b$phenotype) ) labels_colors(dend) <- c( "20160113.wild" = "blue", "20160113.induced" = "red", "20160325.wild" = "dodgerblue", "20160325.induced" = "salmon" )[combined.fac][order.dendrogram(dend)] ## Plot the dendrogram plot(dend) To obtain explicit clusters, we “cut” the tree by removing internal branches such that every subtree represents a distinct cluster. This is most simply done by removing internal branches above a certain height of the tree, as performed by the cutree() function. A more sophisticated variant of this approach is implemented in the dynamicTreeCut package, which uses the shape of the branches to obtain a better partitioning for complex dendrograms. We enable this option by setting cut.dynamic = TRUE, with additional tweaking of the deepSplit parameter to control the resolution of the resulting clusters. library("dynamicTreeCut") ## Perform hierarchical clustering with dynamic tree cut on the PCA ## The BLUSPARAM argument specifies the clustering method (here "ward.D2"), ## and enables dynamic tree cut (cut.dynamic=TRUE) with specific parameters. 
hclust.dyn <- clusterCells(sce.416b, use.dimred = "PCA", BLUSPARAM = HclustParam( method = "ward.D2", cut.dynamic = TRUE, cut.params = list(minClusterSize = 10, deepSplit = 1) ) ) table(hclust.dyn) #> hclust.dyn #> 1 2 3 4 #> 82 70 27 13 ## Plot dendrogram labels_colors(dend) <- as.integer(hclust.dyn)[order.dendrogram(dend)] plot(dend) ## Obtain assignments and plot TSNE colLabels(sce.416b) <- factor(hclust.dyn) plotReducedDim(sce.416b, "TSNE", colour_by = "label") 11.8.4 Subclustering Another simple approach to improving resolution is to repeat the feature selection and clustering within a single cluster. This aims to select HVGs and PCs that are more relevant to internal structure, improving resolution by avoiding noise from unnecessary features. Subsetting also encourages clustering methods to separate cells according to more modest heterogeneity in the absence of distinct subpopulations. 11.9 Marker gene detection 11.10 Cell type annotation "],["introduction-to-spatial-transcriptomics.html", "12 Introduction to spatial transcriptomics 12.1 3’ Visium spatial technology 12.2 Spatial data visualization Bibliography", " 12 Introduction to spatial transcriptomics Instructor: Daianna Gonzalez-Padilla You might also be interested in this recent blog post by Leo https://lcolladotor.github.io/2024/05/23/humanpilot-first-spatially-resolved-transcriptomics-study-using-visium/ and the companion walk through video For a journal club presentation on the HumanPilot paper, check this video: In recent years, with constant improvements in the current sequencing technologies and the generation of more sophisticated omics methodologies and bioinformatic pipelines, we have been constantly demonstrating that specific cell types and cell-to-cell interactions play critical roles in the definition of numerous diseases and development-related processes. In fact, cell type-specific associations have been established for a number of diseases and disorders. 
Thus, understanding the cellular context and the spatial location in which normal and deregulated cellular events occur is necessary to unveil the molecular underpinnings of disease pathologies and malfunctions of the organisms. Spatial transcriptomics technologies are molecular profiling methods developed to measure gene expression levels in a tissue sample at the spatial resolution. These methods have been improved and expanded over time and are widely applied to study a wide range of biological processes and have provided numerous insights into disease and development mechanisms. In particular, the 10x Genomics Visium platform is a technology that spatially profiles the transcriptome of frozen and fixed tissue sections in combination with histology. 12.1 3’ Visium spatial technology This is the most frequently used Visium technology and it captures polyadenylated transcripts within individual spatially barcoded spots. In the Visium expression slide there are 4 capture areas, each of 6.5 (+1.5) mm\(^2\) with ~5k barcoded spots (55 µm in diameter each), within which mRNAs are captured by polyT primers that contain a read for sequencing (see below), a UMI (unique molecular identifier), and a spatial barcode. In this way, all RNAs trapped in the same spot are tagged with the same spot-specific barcode and we can computationally trace the original location of the transcripts. Figure 1: Schematic representation of the Visium capture areas and spots. Source: SciLifeLab (2023). 12.2 Spatial data visualization In order to interactively visualize example spatial data we’ll use the shiny web application of spatialLIBD: http://spatial.libd.org/spatialLIBD/. This web application allows users to browse the human dorsolateral pre-frontal cortex (DLPFC) spatial transcriptomics data generated at the LIBD using the 10x Genomics Visium platform. In total there are 12 DLPFC tissue sections from 3 donors, each spanning six classical histological layers plus the white matter (WM). 
Figure 2: Human DLPFC tissue section. Spot plot depicting the 6 classical histological layers (L1-L6) and the white matter (WM) in a human DLPFC sample. 12.2.1 Spot-level data exploration With this tool you can: Observe per-spot QC metrics and gene expression levels Explore spot clusters in the tissue sections Visualize the spot data on reduced dimensions Manually annotate spots to layers and export your manual annotations Customize the spatial images p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 📝 Exercise 1: visualize the clustering of spots in all tissue sections using the different discrete variables to plot. Which one recapitulates better the six histological layers (plus the white matter) of the human DLPFC? 📝 Exercise 2: explore the expression of SNAP25 (neuronal marker gene), MOBP (oligodendrocyte/WM marker gene), and PCP4 (layer 5 marker gene) in each DLPFC tissue section. What do you observe? Are there any spatial patterns in the expression of these genes? 12.2.2 Layer-level data exploration Layer-level data result from pseudo-bulking the spot-level data, i.e. from aggregating spot data from all spots assigned to a given layer. At this level the tool allows to: Visualize the gene expression data at the layer level in reduced dimensions Plot the layer-level lognorm or raw expression of a gene across all tissue sections and extract DEGs among layers (ANOVA model), in a specific layer compared to the rest (enrichment model) or compared to another layer (pairwise model) Assess the enrichment of your own sets of genes of interest among the DEGs from these spatial DLPFC data Correlate gene-wise statistics for DE between sn/scRNA-seq data clusters/cell populations with the DE statistics in the human DLPFC layers provided in this study. 
This can be used to label your sn/scRNA-seq groups or clusters with the more molecularly-defined histological layers 📝 Exercise 3: plot the expression of SNAP25, MOBP, and PCP4 in the different layers of each DLPFC tissue section. Are there any significant differences in the expression of these genes between layers under any of the statistical models for DGE? p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } 👉🏼 There is also the spatialLIBD R/Bioconductor package you can use to interactively inspect your own spatial data in a shiny web app. Bibliography SciLifeLab (2023). 10X Genomics Visium for Fresh Frozen samples. Web site: https://ngisweden.scilifelab.se/methods/10x-visium/ 10x Genomics (n.d.). Whole transcriptome discovery in the tissue context. Web site: https://www.10xgenomics.com/platforms/visium "],["re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html", "13 Re-use of bulk RNA-seq methods for spatial data exercise 13.1 Spatial registration 13.2 Exercise", " 13 Re-use of bulk RNA-seq methods for spatial data exercise Instructor: Leo New in @sciencemagazine: our work from @LieberInstitute #spatialDLPFC applies #snRNAseq and #Visium spatial transcriptomic in the DLPFC to better understand anatomical structure and cellular populations in the human brain #PsychENCODE https://t.co/DKZqmG4YDi https://t.co/Tjp2OjTo63 pic.twitter.com/vQbjts2JtQ — Louise Huuki-Myers (@lahuuki) May 23, 2024 13.1 Spatial registration In 2023, Louise A. Huuki-Myers contributed a new vignette to spatialLIBD as noted on the package news / changelog: http://research.libd.org/spatialLIBD/news/index.html#spatiallibd-1132. You should be able to run without any issues the code Louise explained at http://research.libd.org/spatialLIBD/articles/guide_to_spatial_registration.html. 
This same information is displayed at https://bioconductor.org/packages/release/data/experiment/vignettes/spatialLIBD/inst/doc/guide_to_spatial_registration.html. ## get reference layer enrichment statistics layer_modeling_results <- spatialLIBD::fetch_data(type = "modeling_results") #> adding rname 'https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1' #> 2024-06-10 23:30:27.833595 loading file /github/home/.cache/R/BiocFileCache/399798f9392_Human_DLPFC_Visium_modeling_results.Rdata%3Fdl%3D1 If the above doesn’t work, related to the curl issue we previously discussed, then use this workaround: tmp_modeling_results <- tempfile("modeling_results.RData") download.file( "https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1", tmp_modeling_results, mode = "wb" ) load(tmp_modeling_results, verbose = TRUE) #> Loading objects: #> modeling_results ## Let's rename the object into the name used in the ## spatial registration vignette (from spatialLIBD) layer_modeling_results <- modeling_results This journal club style video of the main results of the spatialDLPFC paper does explain the basics of spatial registration: For more on spatialDLPFC, check this second video about the supplementary results: 13.2 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise: Follow the vignette on spatial registration. Do the results change when you use cutoff_merge_ratio = 0.1? What is this argument controlling? "],["making-your-own-website-with-postcards.html", "14 Making your own website with postcards 14.1 here 14.2 Usethis 14.3 Git + GitHub 14.4 R websites 14.5 postcards 14.6 Create your own website with postcards! 14.7 References", " 14 Making your own website with postcards Instructor: Melissa Mayén Quiroz Welcome to “Making your own website with postcards”! 
Here we will explore essential tools and techniques to help you create and publish your own website using R and the postcards package. Content: here usethis Git + GitHub R websites postcards Create your own website with postcards! 14.1 here The here package is a powerful tool for managing file paths in your R projects. It helps you construct paths to files relative to your project’s root, ensuring your code is more robust and easier to share with others. Using here helps avoid issues with hard-coded paths and enhances the reproducibility of your analyses. The base directory it takes will be the one you are in when you load the here package, heuristically finding the root of the project and positioning itself there. In this case, the package is already installed so we just need to load it. ## Install the package manually # install.packages("here") ## Load "here" (previously installed) library("here") Sometimes there might be an error, as it might clash with other packages (like plyr). To avoid this, we can use here::here() (which basically clarifies that the requested function is from the here package). here::here() #> [1] "/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024" Some useful commands are getwd() and setwd(), which deal with the working directory, which is the default location where R looks for files to read or save. getwd() retrieves the current working directory. setwd() allows changing the current working directory. getwd() # returns the current path setwd("desired/directory") # changes to the specified path Best Practice: Instead of using “setwd” to manually set your working directory, it is often better to use the “here” package. Using “here” avoids issues with hard-coded paths and ensures your scripts work regardless of the specific setup of your working environment. 
## Instead of "C:/Users/user/Desktop/data/myfile.csv" ## Use here to construct file paths file_path <- here("Users", "user", "Desktop", "data", "myfile.csv") # file_path <- here:here("Users", "user", "Desktop","data", "myfile.csv") data <- read.csv(file_path) Other examples of how “here” could be used: ## Example: save data to a file and load it a <- 1 c <- 23 save(a, c, file = here("test-data.RData")) # save(a, c, file = here:here("test-data.RData")) load(here("test-data.RData")) # load(here:here("test-data.RData")) ## Create a directory dir.create(here("subdirectory"), showWarnings = FALSE) # dir.create(here:here("subdirectory"), showWarnings = FALSE) ## Create a file, indicating the subdirectory (the first argument in this case) file.create(here("subdirectory", "filename")) #> [1] TRUE # file.create(here:here("subdirectory", "filename")) ## Open the new created file file.show(here("subdirectory", "filename")) # file.show(here:here("subdirectory", "filename")) ## For example, if we want to see our files in the directory list.files(here(), recursive = TRUE) #> [1] "_main_files/figure-html/CCA-1.png" #> [2] "_main_files/figure-html/cut_dendogram-1.png" #> [3] "_main_files/figure-html/cut_dendogram-2.png" #> [4] "_main_files/figure-html/EMM_example1-1.png" #> [5] "_main_files/figure-html/heat map-1.png" #> [6] "_main_files/figure-html/hist_libSizeFactors-1.png" #> [7] "_main_files/figure-html/hist_p-1.png" #> [8] "_main_files/figure-html/modelGeneVar_batch-1.png" #> [9] "_main_files/figure-html/modelGeneVar_zeisel-1.png" #> [10] "_main_files/figure-html/modelGeneVarByPoisson_zeisel-1.png" #> [11] "_main_files/figure-html/modelGeneVarWithSpikes_416b-1.png" #> [12] "_main_files/figure-html/PCs_zeisel-1.png" #> [13] "_main_files/figure-html/plot_clusters_zeisel-1.png" #> [14] "_main_files/figure-html/plot_dendogram-1.png" #> [15] "_main_files/figure-html/Plot_multiplePCA_PCs-1.png" #> [16] "_main_files/figure-html/QC_sce416b_plots-1.png" #> [17] 
"_main_files/figure-html/runTSNE_zeisel-1.png" #> [18] "_main_files/figure-html/TSNE_perplexity_plots-1.png" #> [19] "_main_files/figure-html/Umap_zeisel-1.png" #> [20] "_main_files/figure-html/unnamed-chunk-14-1.png" #> [21] "_main_files/figure-html/unnamed-chunk-15-1.png" #> [22] "_main_files/figure-html/unnamed-chunk-16-1.png" #> [23] "_main_files/figure-html/unnamed-chunk-17-1.png" #> [24] "_main_files/figure-html/unnamed-chunk-18-1.png" #> [25] "_main_files/figure-html/unnamed-chunk-19-1.png" #> [26] "_main_files/figure-html/VarExplained_PCs-1.png" #> [27] "_main_files/figure-html/volcano plot-1.png" #> [28] "_main_files/figure-html/voom-1.png" #> [29] "_main.Rmd" #> [30] "01_SummarizedExperiment.R" #> [31] "01_SummarizedExperiment.Rmd" #> [32] "02_iSEE.R" #> [33] "02_iSEE.Rmd" #> [34] "03_recount3_intro.R" #> [35] "03_recount3_intro.Rmd" #> [36] "04_DGE_analysis_overview.R" #> [37] "04_DGE_analysis_overview.Rmd" #> [38] "05_DGE_with_limma_voom.R" #> [39] "05_DGE_with_limma_voom.Rmd" #> [40] "06_ExploreModelMatrix.R" #> [41] "06_ExploreModelMatrix.Rmd" #> [42] "07_model_variable_selection.R" #> [43] "07_model_variable_selection.Rmd" #> [44] "08_DEG_exercise.R" #> [45] "08_DEG_exercise.Rmd" #> [46] "09_research_talks.R" #> [47] "09_research_talks.Rmd" #> [48] "10_biocthis_intro.R" #> [49] "10_biocthis_intro.Rmd" #> [50] "11_scRNAseq_overview.R" #> [ reached getOption("max.print") -- omitted 58 entries ] # list.files(here:here(), recursive = TRUE) 14.2 Usethis The usethis package simplifies many common setup tasks and workflows in R. It helps streamline the process of creating new projects, setting up Git repositories, and connecting with GitHub. Mastering usethis allows you to focus more on coding and less on configuration. In this case, the package is already installed so we just need to load it. 
## Install the package manually # install.packages("usethis") ## Load "usethis (previously installed) library("usethis") Usage: All use_*() functions operate on the current directory. ## usethis::use_*() usethis::use_r() usethis::use_git() usethis::use_readme_md() ✔ indicates that usethis has setup everything for you. ● indicates that you’ll need to do some work yourself. ## For example, create a README file usethis::use_readme_md() #> ✔ Setting active project to '/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024' #> ✔ Writing 'README.md' More functions in usethis: usethis RDocumentation In the following exercises, we will see some uses of usethis. 14.3 Git + GitHub GitHub An Intro to Git and GitHub for Beginners (Tutorial) by HubSpot Version control is a critical skill. Git helps you track changes in your projects, collaborate with others, and maintain a history of your work. GitHub, a platform for hosting Git repositories, enables seamless collaboration and sharing of your projects with the world. Understanding Git and GitHub ensures your projects are well-organized and accessible. 14.3.1 Prerequisites We need a GitHub account. If you don’t have one, now is the time to create it! Create a GitHub account We also need to install Git on our computers as the gitcreds package requires it. Installing Git After installing Git, restart RStudio to allow it to annex. In this case, the packages are already installed so we just need to load them. # install.packages(c("gitcreds", "gert", "gh")) ## Load them separately library("gitcreds") library("gert") library("gh") 14.3.2 Creating a personal access token (PAT) To connect our RStudio repository with GitHub, we request a token, which allows GitHub to grant permission to our computer. You can request the token using R (choose a meaningful name). 
## Initiate connection with GitHub usethis::create_github_token() # redirects to GitHub where you'll choose a specific name for the token Copy the token to enter it later with gitcreds_set() gitcreds::gitcreds_set() # here you place the token (NOT your GitHub password!!!) Another way to request the token is by going to GitHub Tokens, this option will provide a recommendation of the parameters to select. The token expiration parameter can be changed so it does not expire (for security, GitHub does not recommend this). Otherwise, consider its validity period. Once generated, you must save the token, as it will not appear again. You can always generate a new one (don’t forget to delete the previous token). The next step is to configure our GitHub user in the global .gitconfig file: ## Configure GitHub user usethis::edit_git_config() # opens the global .gitconfig file ## Place the name and email of your GitHub account. ## JUST remove the "#" and respect the other spaces # [user] # name = N A M E # email = github_email 14.3.3 Initialize Git and GitHub repository Now let’s initialize the repository in Git (locally on your computer) and then request to connect it with GitHub servers. Git is the software while GitHub is the web platform (based on Git) that allows collaboration. ## Initialize the Git repository usethis::use_git() ## Connect your local Git repository with GitHub servers usethis::use_github() ** Done ** Useful command to check configuration: gh::gh_whoami() 14.3.4 Some other gert commands Once we have linked our repository with GitHub, we can continue updating it. 
Some useful commands for this are: git_add() git_commit() git_log() git_push() ## Write a new file, using here::here to specify the path writeLines("hello", here::here("R", "test-here.R")) ## Another way is to use use_r usethis::use_r("test-file-github.R") # adds file to the project's R directory ## For example, we might try adding something new gert::git_add("R/test-file-github.R") ## Add commit of what was done gert::git_commit("uploaded test file") ## Gives info about the commits gert::git_log() ## Upload your changes from the local repo to GitHub gert::git_push() # IMPORTANT COMMAND It might be more user-friendly to use the Git pane that appears in RStudio :) 14.4 R websites Creating websites using R opens up new ways to share your analyses, reports, and research. Whether you are building static sites with R Markdown or dynamic applications with Shiny, R provides powerful tools to make your content interactive and engaging. Learning to create and deploy R websites enhances your ability to communicate your work effectively. 14.4.1 1. Set Up _site.yml Creating a website with R Markdown involves several key steps. First, you set up a _site.yml file, which configures the site’s name, navigation bar, and global options like themes and additional CSS or JavaScript files. This file ensures a consistent look and feel across all pages. YAML (.yml file) name: "My Website" output_dir: "docs" navbar: title: "My Website" left: - text: "Home" href: index.html - text: "About" href: about.html output: html_document: theme: cosmo highlight: tango 14.4.2 2. Create index.Rmd for the Homepage The homepage is created using an index.Rmd file, which acts as the main entry point for visitors, providing an introduction or overview of the site. Additional pages, such as about.Rmd, offer more detailed information about the website or its author. 
Markdown (index.Rmd file) --- title: "Welcome to My Website" author: "Your Name" date: "2024-06-10" output: html_document --- # Welcome to My Website This is a website created with R Markdown. Here you can share your analyses, reports, and research. ## Example Section Here is an example of a simple analysis: ## To insert a code block follow the sintaxis removing "#" !!! #` ``{r} summary(cars) # ``` 14.4.3 3. Render the Site To render the site, use the rmarkdown::render_site() function, which converts all R Markdown and Markdown files into HTML. The resulting HTML files and resources are placed in a directory, typically _site. RStudio facilitates this process with tools like the “Knit” button for individual pages and the “Build” pane for the entire site. Common elements, such as shared HTML files and CSS for styling, ensure consistency and avoid redundancy. A well-configured navigation bar enhances user experience by providing easy access to different sections. rmarkdown::render_site() 14.4.4 4. Publish the Website Publishing involves copying the contents of the _site directory to a web server, making your site accessible to others. For example, if you’re creating a personal blog, you would set up the _site.yml file with your site’s title and navigation links. The index.Rmd file would introduce your blog, while about.Rmd would provide information about you. After writing your blog posts in R Markdown files and rendering the site, you would upload the _site directory to your web server. 14.4.4.1 Choose a Hosting Platform: Consider platforms like GitHub Pages or Netlify for easy and free hosting. 14.4.4.2 Upload Files: For GitHub Pages, push your files to a GitHub repository named username.github.io. For Netlify, connect your GitHub repository and configure the deployment settings. 14.4.4.3 Configure Hosting: On GitHub Pages, enable GitHub Pages in the repository settings. 
On Netlify, configure the deployment settings to specify the build command (rmarkdown::render_site()) and output directory (docs if using _site.yml). Continuous Deployment (Netlify). If hosting on a different server, manually upload the files to your server using FTP or a similar method. 14.5 postcards The postcards package makes it easy to create beautiful, single-page websites with minimal effort. It’s perfect for personal websites, portfolios, and project showcases. Using postcards allows you to present your work professionally and creatively, without needing extensive web development knowledge. A collection of R Markdown templates for creating simple and easy-to-personalize single-page websites. “The goal of the package is to make it easy for anyone to create a one-page personal website using an R Markdown document.” Author: Sean Kross [aut, cre] Maintainer: Sean Kross <sean at seankross.com> https://CRAN.R-project.org/package=postcards GitHub: https://github.com/seankross/postcards Similar to https://pages.github.com/ Your webpage should say something about you, your interests, and your projects, as well as how to contact you. Some examples: https://amy-peterson.github.io/ via https://github.com/amy-peterson/amy-peterson.github.com http://jtleek.com/ via https://github.com/jtleek/jtleek.github.io http://aejaffe.com/ via https://github.com/andrewejaffe/andrewejaffe.github.io https://hadley.nz/ via https://github.com/hadley/hadley.github.com https://emarquezz.github.io/ via https://github.com/emarquezz/emarquezz.github.io https://bpardo99.github.io/ via https://github.com/bpardo99/bpardo99.github.io https://daianna21.github.io/ via https://github.com/daianna21/daianna21.github.io. 14.5.1 Installation In this case, the package is already installed. 
## You can install Postcards with the following command: # install.packages("postcards") ## Or you can install the latest development version (not recommended): # remotes::install_github("seankross/postcards@main") 14.5.2 Templates Postcards include five templates: Jolla, Jolla Blue, Trestles, Onofre, and Solana. Each site is optimized for viewing on both desktop and mobile devices. The goal of the package is to make it easy for anyone to create a one-page personal website using an R Markdown document. Jolla: Jolla Blue: Trestles: Onofre: Solana: To start personalizing one of these templates, you need to create a new project. 14.6 Create your own website with postcards! Create your own website: Following the next steps you will be able to create your own personal website. You will need to have a GitHub account and connect Git. In case you missed it, you can go back to the “Git + GitHub” section. 14.6.1 Create a New Project in RStudio (Interactive Selection) If you use RStudio: Select “File”, “New Project”… Choose “New Directory”, “Postcards Website” Enter a directory name for your project in RStudio (“Your_Username.github.io”) Choose one of the templates from a dropdown menu Select “Create Project” after choosing a name for the folder that will contain your site. 
This folder will contain two important files: An R Markdown document with your site’s content A sample photo you should replace (with your own) ## Create a new project usethis::create_project("Your_Username.github.io") 14.6.2 Set Up Git and GitHub To save changes, you need to set up Git and GitHub ## Set up Git and GitHub usethis::use_git() # Restart the session usethis::use_github() 14.6.3 Choose a Template ## Choose only one template (the one you like the most) postcards::create_postcard(template = "jolla") postcards::create_postcard(template = "jolla-blue") postcards::create_postcard(template = "trestles") postcards::create_postcard(template = "onofre") postcards::create_postcard(template = "solana") In this way, you will also get the 2 important files: An R Markdown document with your site’s content A sample photo you should replace 14.6.4 Edit with Your Information Now you should edit the R Markdown document with your information and replace the image with one of your choice :) Fill in your information using the Markdown format. For example, https://github.com/andrewejaffe/andrewejaffe.github.io/blob/master/index.Rmd#L17-L31. 
Add your profiles in the style of https://github.com/andrewejaffe/andrewejaffe.github.io/blob/master/index.Rmd#L7-L12 14.6.5 Deploy the Page To compile the self-contained HTML file for the site: In RStudio, you can use the “Knit” button or directly: ## Deploy the GitHub page rmarkdown::render("index.Rmd") ** Done ** 14.7 References https://comunidadbioinfo.github.io/cdsb2021_scRNAseq/ejercicio-usando-usethis-here-y-postcards.html#vinculando-rstudio-con-git-y-github https://here.r-lib.org/ https://usethis.r-lib.org/ https://rmarkdown.rstudio.com/lesson-13.html https://bookdown.org/yihui/rmarkdown/rmarkdown-site.html https://product.hubspot.com/blog/git-and-github-tutorial-for-beginners https://github.com/Melii99/rnaseq_2024_postcards/blob/master/Actividad_postcards.Rmd https://lcolladotor.github.io/jhustatcomputing2023/projects/project-0/ "],["final-r-session.html", "Final R Session", " Final R Session This is the final R session after all the code in this book is run sequentially. #> ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────── #> setting value #> version R version 4.4.0 (2024-04-24) #> os Ubuntu 22.04.4 LTS #> system x86_64, linux-gnu #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz UTC #> date 2024-06-10 #> pandoc 3.1.13 @ /usr/bin/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> abind 1.4-5 2016-07-21 [1] RSPM (R 4.4.0) #> airway * 1.24.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.base 1.4.1 2024-05-03 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.matrix 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.ranges 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.sce 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.schemas 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> 
alabaster.se 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationDbi * 1.66.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationFilter * 1.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationHub * 3.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> aod 1.3.3 2023-12-13 [1] RSPM (R 4.4.0) #> askpass 1.2.0 2023-09-03 [2] RSPM (R 4.4.0) #> attempt 0.3.1 2020-05-03 [1] RSPM (R 4.4.0) #> backports 1.5.0 2024-05-23 [1] RSPM (R 4.4.0) #> base64enc 0.1-3 2015-07-28 [2] RSPM (R 4.4.0) #> beachmat 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> beeswarm 0.4.0 2021-06-01 [1] RSPM (R 4.4.0) #> benchmarkme 1.0.8 2022-06-12 [1] RSPM (R 4.4.0) #> benchmarkmeData 1.0.4 2020-04-23 [1] RSPM (R 4.4.0) #> bibtex 0.5.1 2023-01-26 [1] RSPM (R 4.4.0) #> Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocFileCache * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocIO 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocManager 1.30.23 2024-05-04 [2] CRAN (R 4.4.0) #> BiocNeighbors 1.22.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocParallel * 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocSingular 1.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocStyle * 2.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> biocthis * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocVersion 3.19.1 2024-04-17 [2] Bioconductor 3.19 (R 4.4.0) #> Biostrings 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.0) #> bit 4.0.5 2022-11-15 [1] RSPM (R 4.4.0) #> bit64 4.0.5 2020-08-30 [1] RSPM (R 4.4.0) #> bitops 1.0-7 2021-04-24 [1] RSPM (R 4.4.0) #> blob 1.2.4 2023-03-17 [1] RSPM (R 4.4.0) #> bluster * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> bookdown 0.39 2024-04-15 [1] RSPM (R 4.4.0) #> boot 1.3-30 2024-02-26 [3] CRAN (R 4.4.0) #> brio 1.1.5 2024-04-24 [2] RSPM (R 4.4.0) #> broom 1.0.6 2024-05-17 [1] RSPM (R 4.4.0) #> bslib 0.7.0 2024-03-29 [2] RSPM 
(R 4.4.0) #> cachem 1.1.0 2024-05-16 [2] RSPM (R 4.4.0) #> Cairo 1.6-2 2023-11-28 [1] RSPM (R 4.4.0) #> caTools 1.18.2 2021-03-28 [1] RSPM (R 4.4.0) #> checkmate 2.3.1 2023-12-04 [1] RSPM (R 4.4.0) #> circlize * 0.4.16 2024-02-20 [1] RSPM (R 4.4.0) #> cli 3.6.2 2023-12-11 [2] RSPM (R 4.4.0) #> clue 0.3-65 2023-09-23 [1] RSPM (R 4.4.0) #> cluster 2.1.6 2023-12-01 [3] CRAN (R 4.4.0) #> codetools 0.2-20 2024-03-31 [3] CRAN (R 4.4.0) #> colorspace 2.1-0 2023-01-23 [1] RSPM (R 4.4.0) #> colourpicker 1.3.0 2023-08-21 [1] RSPM (R 4.4.0) #> ComplexHeatmap * 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> config 0.3.2 2023-08-30 [1] RSPM (R 4.4.0) #> corpcor 1.6.10 2021-09-16 [1] RSPM (R 4.4.0) #> cowplot * 1.1.3 2024-01-22 [1] RSPM (R 4.4.0) #> crayon 1.5.2 2022-09-29 [2] RSPM (R 4.4.0) #> credentials 2.0.1 2023-09-06 [2] RSPM (R 4.4.0) #> curl 5.2.1 2024-03-01 [1] RSPM (R 4.4.0) #> data.table 1.15.4 2024-03-30 [1] RSPM (R 4.4.0) #> DBI 1.2.3 2024-06-02 [1] RSPM (R 4.4.0) #> dbplyr * 2.5.0 2024-03-19 [1] RSPM (R 4.4.0) #> DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.0) #> DelayedMatrixStats 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> dendextend * 1.17.1 2023-03-25 [1] RSPM (R 4.4.0) #> desc 1.4.3 2023-12-10 [2] RSPM (R 4.4.0) #> digest 0.6.35 2024-03-11 [2] RSPM (R 4.4.0) #> doParallel 1.0.17 2022-02-07 [1] RSPM (R 4.4.0) #> dotCall64 1.1-1 2023-11-28 [1] RSPM (R 4.4.0) #> dplyr 1.1.4 2023-11-17 [1] RSPM (R 4.4.0) #> dqrng 0.4.1 2024-05-28 [1] RSPM (R 4.4.0) #> DT 0.33 2024-04-04 [1] RSPM (R 4.4.0) #> dynamicTreeCut * 1.63-1 2016-03-11 [1] RSPM (R 4.4.0) #> edgeR * 4.2.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> ensembldb * 2.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> EnvStats 2.8.1 2023-08-22 [1] RSPM (R 4.4.0) #> evaluate 0.23 2023-11-01 [2] RSPM (R 4.4.0) #> ExperimentHub * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> ExploreModelMatrix * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> fANCOVA 0.6-1 2020-11-13 [1] 
RSPM (R 4.4.0) #> fansi 1.0.6 2023-12-08 [2] RSPM (R 4.4.0) #> farver 2.1.2 2024-05-13 [1] RSPM (R 4.4.0) #> fastmap 1.2.0 2024-05-15 [2] RSPM (R 4.4.0) #> fields 15.2 2023-08-17 [1] RSPM (R 4.4.0) #> filelock 1.0.3 2023-12-11 [1] RSPM (R 4.4.0) #> FNN 1.1.4 2024-01-12 [1] RSPM (R 4.4.0) #> foreach 1.5.2 2022-02-02 [1] RSPM (R 4.4.0) #> foreign 0.8-86 2023-11-28 [3] CRAN (R 4.4.0) #> Formula 1.2-5 2023-02-24 [1] RSPM (R 4.4.0) #> fs 1.6.4 2024-04-25 [2] RSPM (R 4.4.0) #> generics 0.1.3 2022-07-05 [1] RSPM (R 4.4.0) #> GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) #> GenomeInfoDbData 1.2.12 2024-05-26 [1] Bioconductor #> GenomicAlignments 1.40.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> GenomicFeatures * 1.56.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> GenomicRanges * 1.56.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> gert * 2.0.1 2023-12-04 [2] RSPM (R 4.4.0) #> GetoptLong 1.0.5 2020-12-15 [1] RSPM (R 4.4.0) #> ggbeeswarm 0.7.2 2023-04-29 [1] RSPM (R 4.4.0) #> ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0) #> ggrepel * 0.9.5 2024-01-10 [1] RSPM (R 4.4.0) #> gh * 1.4.1 2024-03-28 [2] RSPM (R 4.4.0) #> gitcreds * 0.1.2 2022-09-08 [2] RSPM (R 4.4.0) #> GlobalOptions 0.1.2 2020-06-10 [1] RSPM (R 4.4.0) #> glue 1.7.0 2024-01-09 [2] RSPM (R 4.4.0) #> golem 0.4.1 2023-06-05 [1] RSPM (R 4.4.0) #> gplots 3.1.3.1 2024-02-02 [1] RSPM (R 4.4.0) #> gridExtra 2.3 2017-09-09 [1] RSPM (R 4.4.0) #> gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0) #> gtools 3.9.5 2023-11-20 [1] RSPM (R 4.4.0) #> gypsum 1.0.1 2024-05-08 [1] Bioconductor 3.19 (R 4.4.0) #> HDF5Array 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> here * 1.0.1 2020-12-13 [1] RSPM (R 4.4.0) #> highr 0.11 2024-05-26 [2] RSPM (R 4.4.0) #> Hmisc * 5.1-3 2024-05-28 [1] RSPM (R 4.4.0) #> htmlTable 2.4.2 2023-10-29 [1] RSPM (R 4.4.0) #> htmltools 0.5.8.1 2024-04-04 [2] RSPM (R 4.4.0) #> htmlwidgets 1.6.4 2023-12-06 [2] RSPM (R 4.4.0) #> httpuv 1.6.15 2024-03-26 [2] RSPM (R 4.4.0) #> httr 1.4.7 
2023-08-15 [2] RSPM (R 4.4.0) #> httr2 1.0.1 2024-04-01 [2] RSPM (R 4.4.0) #> igraph 2.0.3 2024-03-13 [1] RSPM (R 4.4.0) #> IRanges * 2.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> irlba 2.3.5.1 2022-10-03 [1] RSPM (R 4.4.0) #> iSEE * 2.16.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> iterators 1.0.14 2022-02-05 [1] RSPM (R 4.4.0) #> jquerylib 0.1.4 2021-04-26 [2] RSPM (R 4.4.0) #> jsonlite 1.8.8 2023-12-04 [2] RSPM (R 4.4.0) #> KEGGREST 1.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> KernSmooth 2.23-24 2024-05-17 [3] RSPM (R 4.4.0) #> knitr 1.47 2024-05-29 [2] RSPM (R 4.4.0) #> labeling 0.4.3 2023-08-29 [1] RSPM (R 4.4.0) #> later 1.3.2 2023-12-06 [2] RSPM (R 4.4.0) #> lattice 0.22-6 2024-03-20 [3] CRAN (R 4.4.0) #> lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.4.0) #> lifecycle 1.0.4 2023-11-07 [2] RSPM (R 4.4.0) #> limma * 3.60.2 2024-05-19 [1] Bioconductor 3.19 (R 4.4.0) #> listviewer 4.0.0 2023-09-30 [1] RSPM (R 4.4.0) #> lme4 1.1-35.3 2024-04-16 [1] RSPM (R 4.4.0) #> lmerTest 3.1-3 2020-10-23 [1] RSPM (R 4.4.0) #> lobstr * 1.1.2 2022-06-22 [1] RSPM (R 4.4.0) #> locfit 1.5-9.9 2024-03-01 [1] RSPM (R 4.4.0) #> lubridate 1.9.3 2023-09-27 [1] RSPM (R 4.4.0) #> magick 2.8.3 2024-02-18 [1] RSPM (R 4.4.0) #> magrittr 2.0.3 2022-03-30 [2] RSPM (R 4.4.0) #> maps 3.4.2 2023-12-15 [1] RSPM (R 4.4.0) #> MASS 7.3-60.2 2024-05-06 [3] local #> Matrix 1.7-0 2024-03-22 [3] CRAN (R 4.4.0) #> MatrixGenerics * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> matrixStats * 1.3.0 2024-04-11 [1] RSPM (R 4.4.0) #> memoise 2.0.1 2021-11-26 [2] RSPM (R 4.4.0) #> metapod 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> mgcv 1.9-1 2023-12-21 [3] CRAN (R 4.4.0) #> mime 0.12 2021-09-28 [2] RSPM (R 4.4.0) #> miniUI 0.1.1.1 2018-05-18 [2] RSPM (R 4.4.0) #> minqa 1.2.7 2024-05-20 [1] RSPM (R 4.4.0) #> munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0) #> mvtnorm 1.2-5 2024-05-21 [1] RSPM (R 4.4.0) #> nlme 3.1-165 2024-06-06 [3] RSPM (R 4.4.0) #> nloptr 2.0.3 2022-05-26 [1] RSPM (R 
4.4.0) #> nnet 7.3-19 2023-05-03 [3] CRAN (R 4.4.0) #> numDeriv 2016.8-1.1 2019-06-06 [1] RSPM (R 4.4.0) #> openssl 2.2.0 2024-05-16 [2] RSPM (R 4.4.0) #> paletteer 1.6.0 2024-01-21 [1] RSPM (R 4.4.0) #> patchwork * 1.2.0 2024-01-08 [1] RSPM (R 4.4.0) #> pbkrtest 0.5.2 2023-01-19 [1] RSPM (R 4.4.0) #> pheatmap * 1.0.12 2019-01-04 [1] RSPM (R 4.4.0) #> pillar 1.9.0 2023-03-22 [2] RSPM (R 4.4.0) #> pkgconfig 2.0.3 2019-09-22 [2] RSPM (R 4.4.0) #> pkgload 1.3.4 2024-01-16 [2] RSPM (R 4.4.0) #> plotly 4.10.4 2024-01-13 [1] RSPM (R 4.4.0) #> plyr 1.8.9 2023-10-02 [1] RSPM (R 4.4.0) #> png 0.1-8 2022-11-29 [1] RSPM (R 4.4.0) #> Polychrome * 1.5.1 2022-05-03 [1] RSPM (R 4.4.0) #> postcards * 0.2.3 2022-01-07 [1] RSPM (R 4.4.0) #> praise 1.0.0 2015-08-11 [2] RSPM (R 4.4.0) #> prettyunits 1.2.0 2023-09-24 [2] RSPM (R 4.4.0) #> promises 1.3.0 2024-04-05 [2] RSPM (R 4.4.0) #> ProtGenerics 1.36.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> purrr 1.0.2 2023-08-10 [2] RSPM (R 4.4.0) #> R.cache 0.16.0 2022-07-21 [1] RSPM (R 4.4.0) #> R.methodsS3 1.8.2 2022-06-13 [1] RSPM (R 4.4.0) #> R.oo 1.26.0 2024-01-24 [1] RSPM (R 4.4.0) #> R.utils 2.12.3 2023-11-18 [1] RSPM (R 4.4.0) #> R6 2.5.1 2021-08-19 [2] RSPM (R 4.4.0) #> rappdirs 0.3.3 2021-01-31 [2] RSPM (R 4.4.0) #> rbibutils 2.2.16 2023-10-25 [1] RSPM (R 4.4.0) #> RColorBrewer * 1.1-3 2022-04-03 [1] RSPM (R 4.4.0) #> Rcpp 1.0.12 2024-01-09 [2] RSPM (R 4.4.0) #> RCurl 1.98-1.14 2024-01-09 [1] RSPM (R 4.4.0) #> Rdpack 2.6 2023-11-08 [1] RSPM (R 4.4.0) #> recount3 * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> RefManageR * 1.4.0 2022-09-30 [1] RSPM (R 4.4.0) #> remaCor 0.0.18 2024-02-08 [1] RSPM (R 4.4.0) #> rematch2 2.1.2 2020-05-01 [2] RSPM (R 4.4.0) #> reshape2 1.4.4 2020-04-09 [1] RSPM (R 4.4.0) #> restfulr 0.0.15 2022-06-16 [1] RSPM (R 4.4.0) #> rhdf5 2.48.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> rhdf5filters 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> Rhdf5lib 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 
4.4.0) #> RhpcBLASctl 0.23-42 2023-02-11 [1] RSPM (R 4.4.0) #> rintrojs 0.3.4 2024-01-11 [1] RSPM (R 4.4.0) #> rjson 0.2.21 2022-01-09 [1] RSPM (R 4.4.0) #> rlang * 1.1.4 2024-06-04 [2] RSPM (R 4.4.0) #> rmarkdown 2.27 2024-05-17 [2] RSPM (R 4.4.0) #> rpart 4.1.23 2023-12-05 [3] CRAN (R 4.4.0) #> rprojroot 2.0.4 2023-11-05 [2] RSPM (R 4.4.0) #> Rsamtools 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> RSQLite 2.3.7 2024-05-27 [1] RSPM (R 4.4.0) #> rstudioapi 0.16.0 2024-03-24 [2] RSPM (R 4.4.0) #> rsvd 1.0.5 2021-04-16 [1] RSPM (R 4.4.0) #> rtracklayer 1.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> Rtsne 0.17 2023-12-07 [1] RSPM (R 4.4.0) #> S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.0) #> S4Vectors * 0.42.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> sass 0.4.9 2024-03-15 [2] RSPM (R 4.4.0) #> ScaledMatrix 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0) #> scater * 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scatterplot3d 0.3-44 2023-05-05 [1] RSPM (R 4.4.0) #> scran * 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scRNAseq * 2.18.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> scuttle * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> sessioninfo * 1.2.2 2021-12-06 [2] RSPM (R 4.4.0) #> shape 1.4.6.1 2024-02-23 [1] RSPM (R 4.4.0) #> shiny 1.8.1.1 2024-04-02 [2] RSPM (R 4.4.0) #> shinyAce 0.4.2 2022-05-06 [1] RSPM (R 4.4.0) #> shinydashboard 0.7.2 2021-09-30 [1] RSPM (R 4.4.0) #> shinyjs 2.1.0 2021-12-23 [1] RSPM (R 4.4.0) #> shinyWidgets 0.8.6 2024-04-24 [1] RSPM (R 4.4.0) #> SingleCellExperiment * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> smokingMouse * 0.99.91 2024-06-10 [1] Github (LieberInstitute/smokingMouse@96d8480) #> spam 2.10-0 2023-10-23 [1] RSPM (R 4.4.0) #> SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) #> sparseMatrixStats 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> SpatialExperiment * 1.14.0 2024-05-01 [1] Bioconductor 3.19 
(R 4.4.0) #> spatialLIBD * 1.16.2 2024-05-28 [1] Bioconductor 3.19 (R 4.4.0) #> statmod 1.5.0 2023-01-06 [1] RSPM (R 4.4.0) #> stringi 1.8.4 2024-05-06 [2] RSPM (R 4.4.0) #> stringr * 1.5.1 2023-11-14 [2] RSPM (R 4.4.0) #> styler 1.10.3 2024-04-07 [1] RSPM (R 4.4.0) #> SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> sys 3.4.2 2023-05-23 [2] RSPM (R 4.4.0) #> testthat * 3.2.1.1 2024-04-14 [2] RSPM (R 4.4.0) #> tibble 3.2.1 2023-03-20 [2] RSPM (R 4.4.0) #> tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.4.0) #> tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0) #> timechange 0.3.0 2024-01-18 [1] RSPM (R 4.4.0) #> UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> usethis * 2.2.3 2024-02-19 [2] RSPM (R 4.4.0) #> utf8 1.2.4 2023-10-22 [2] RSPM (R 4.4.0) #> uwot 0.2.2 2024-04-21 [1] RSPM (R 4.4.0) #> variancePartition * 1.34.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> vctrs 0.6.5 2023-12-01 [2] RSPM (R 4.4.0) #> vipor 0.4.7 2023-12-18 [1] RSPM (R 4.4.0) #> viridis 0.6.5 2024-01-29 [1] RSPM (R 4.4.0) #> viridisLite 0.4.2 2023-05-02 [1] RSPM (R 4.4.0) #> whisker 0.4.1 2022-12-05 [2] RSPM (R 4.4.0) #> withr 3.0.0 2024-01-16 [2] RSPM (R 4.4.0) #> xfun 0.44 2024-05-15 [2] RSPM (R 4.4.0) #> XML 3.99-0.16.1 2024-01-22 [1] RSPM (R 4.4.0) #> xml2 1.3.6 2023-12-04 [2] RSPM (R 4.4.0) #> xtable 1.8-4 2019-04-21 [2] RSPM (R 4.4.0) #> XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> yaml 2.3.8 2023-12-11 [2] RSPM (R 4.4.0) #> zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> #> [1] /__w/_temp/Library #> [2] /usr/local/lib/R/site-library #> [3] /usr/local/lib/R/library #> #> ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── System curl version information: curl::curl_version() #> $version #> [1] "7.81.0" #> #> $ssl_version #> [1] "OpenSSL/3.0.2" #> #> $libz_version #> [1] "1.2.11" #> #> $libssh_version #> [1] "libssh/0.9.6/openssl/zlib" #> #> $libidn_version #> [1] 
"2.3.2" #> #> $host #> [1] "x86_64-pc-linux-gnu" #> #> $protocols #> [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap" #> [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp" #> [23] "smtps" "telnet" "tftp" #> #> $ipv6 #> [1] TRUE #> #> $http2 #> [1] TRUE #> #> $idn #> [1] TRUE This interactive book was last updated at 2024-06-10 23:30:30.562128. "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]]
+[["index.html", "Statistical Analysis of Genome Scale Data 2024 Overview Download course materials Code of Conduct Course Schedule External links Course Prerequisites R session information License", " Statistical Analysis of Genome Scale Data 2024 Leonardo Collado-Torres Overview Here you can find the files for the June 2024 Statistical Analysis of Genome Scale Data course at CSHL portion taught by Leo and his team (June 9-11). Instructor: Leonardo Collado-Torres, Twitter Teaching assistants: Daianna González Padilla, Twitter Melissa Mayén Quiroz, Twitter Thanks again Sean Davis for inviting us to help teach https://t.co/KulvuQ3XK8 at @cshlcourses!Our teaching materials are available at https://t.co/OP2YYZmqwh 📚 It’s an honor to teach with @lcgunam students & remote @LieberInstitute team members#rstats @Bioconductor pic.twitter.com/U4DzQuPvIn — 🇲🇽 Leonardo Collado-Torres (@lcolladotor) June 9, 2024 Download course materials Download the materials for this course with usethis::use_course('lcolladotor/cshl_rstats_genome_scale_2024') or view online at lcolladotor.github.io/cshl_rstats_genome_scale_2024. This command downloads a static version of the course materials. If you want to be able to easily download updates, we recommend using Git. Happy Git and GitHub for the useR is great for getting your computer ready to use Git and GitHub. If you already have a GitHub account, you can instead use this command to download the course: ## Download it the first time git clone https://github.com/lcolladotor/cshl_rstats_genome_scale_2024.git ## To update the contents, use: cd cshl_rstats_genome_scale_2024 git pull Or you could use the GitHub Desktop application. Code of Conduct We’ll follow the CSHL code of conduct as well as version 1.2.0 of the Bioconductor code of conduct bioconductor.github.io/bioc_coc_multilingual/. For reporting any violations of the code of conduct, report them to the Instructor and/or Course Coordinators. 
Course Schedule Local times in US Eastern See CSHLData2024 for the detailed schedule. External links CSHL course GitHub source code Slack LieberInstitute/template_project LIBD rstats club: check the public schedule 2023 course version Course Prerequisites Install R 4.4.x from CRAN then install the following R packages: ## For installing Bioconductor packages if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } ## Install required packages BiocManager::install( c( "usethis", ## Utilities "BiocFileCache", "RefManageR", "gitcreds", "gert", "gh", "here", "Hmisc", "biocthis", "lobstr", "postcards", "scater", "sessioninfo", "stringr", "SummarizedExperiment", ## Main containers / vis "iSEE", "edgeR", ## RNA-seq "ExploreModelMatrix", "limma", "smokingMouse", "recount3", "rlang", "scRNAseq", "airway", "pheatmap", ## Visualization "ggplot2", "ggrepel", "patchwork", "RColorBrewer", "ComplexHeatmap", "cowplot", "Polychrome", "spatialLIBD", ## Advanced "variancePartition" ) ) You will also need to install RStudio version 2024.04.0+735 or newer. R session information Details on the R version used for making this book. The source code is available at lcolladotor/cshl_rstats_genome_scale_2024. 
## Load the package at the top of your script library("sessioninfo") ## Utilities library("BiocFileCache") library("BiocStyle") library("biocthis") library("gitcreds") library("gert") library("gh") library("here") library("lobstr") library("postcards") library("usethis") library("sessioninfo") ## Data library("smokingMouse") library("scRNAseq") ## Main containers / vis library("SummarizedExperiment") library("iSEE") ## RNA-seq library("airway") library("edgeR") library("ExploreModelMatrix") library("limma") library("recount3") ## QCA library("scater") ## Variance Partition library("variancePartition") ## Visualization: plots & text library("ComplexHeatmap") library("ggplot2") library("patchwork") library("pheatmap") library("RColorBrewer") library("Hmisc") library("stringr") library("cowplot") library("rlang") library("ggrepel") library("Polychrome") ## Spatial transcriptomics library("spatialLIBD") ## Reproducibility information options(width = 120) session_info() ## ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────── ## setting value ## version R version 4.4.0 (2024-04-24) ## os Ubuntu 22.04.4 LTS ## system x86_64, linux-gnu ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz UTC ## date 2024-06-11 ## pandoc 3.1.13 @ /usr/bin/ (via rmarkdown) ## ## ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── ## package * version date (UTC) lib source ## abind 1.4-5 2016-07-21 [1] RSPM (R 4.4.0) ## airway * 1.24.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.base 1.4.1 2024-05-03 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.matrix 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.ranges 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.sce 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.schemas 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## alabaster.se 1.4.1 
2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationDbi 1.66.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationFilter 1.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## AnnotationHub 3.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## aod 1.3.3 2023-12-13 [1] RSPM (R 4.4.0) ## askpass 1.2.0 2023-09-03 [2] RSPM (R 4.4.0) ## attempt 0.3.1 2020-05-03 [1] RSPM (R 4.4.0) ## backports 1.5.0 2024-05-23 [1] RSPM (R 4.4.0) ## base64enc 0.1-3 2015-07-28 [2] RSPM (R 4.4.0) ## beachmat 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## beeswarm 0.4.0 2021-06-01 [1] RSPM (R 4.4.0) ## benchmarkme 1.0.8 2022-06-12 [1] RSPM (R 4.4.0) ## benchmarkmeData 1.0.4 2020-04-23 [1] RSPM (R 4.4.0) ## Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocFileCache * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocIO 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocManager 1.30.23 2024-05-04 [2] CRAN (R 4.4.0) ## BiocNeighbors 1.22.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocParallel * 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocSingular 1.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocStyle * 2.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## biocthis * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## BiocVersion 3.19.1 2024-04-17 [2] Bioconductor 3.19 (R 4.4.0) ## Biostrings 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.0) ## bit 4.0.5 2022-11-15 [1] RSPM (R 4.4.0) ## bit64 4.0.5 2020-08-30 [1] RSPM (R 4.4.0) ## bitops 1.0-7 2021-04-24 [1] RSPM (R 4.4.0) ## blob 1.2.4 2023-03-17 [1] RSPM (R 4.4.0) ## bookdown 0.39 2024-04-15 [1] RSPM (R 4.4.0) ## boot 1.3-30 2024-02-26 [3] CRAN (R 4.4.0) ## broom 1.0.6 2024-05-17 [1] RSPM (R 4.4.0) ## bslib 0.7.0 2024-03-29 [2] RSPM (R 4.4.0) ## cachem 1.1.0 2024-05-16 [2] RSPM (R 4.4.0) ## caTools 1.18.2 2021-03-28 [1] RSPM (R 4.4.0) ## checkmate 2.3.1 2023-12-04 [1] RSPM (R 4.4.0) ## circlize 0.4.16 
2024-02-20 [1] RSPM (R 4.4.0) ## cli 3.6.2 2023-12-11 [2] RSPM (R 4.4.0) ## clue 0.3-65 2023-09-23 [1] RSPM (R 4.4.0) ## cluster 2.1.6 2023-12-01 [3] CRAN (R 4.4.0) ## codetools 0.2-20 2024-03-31 [3] CRAN (R 4.4.0) ## colorspace 2.1-0 2023-01-23 [1] RSPM (R 4.4.0) ## colourpicker 1.3.0 2023-08-21 [1] RSPM (R 4.4.0) ## ComplexHeatmap * 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## config 0.3.2 2023-08-30 [1] RSPM (R 4.4.0) ## corpcor 1.6.10 2021-09-16 [1] RSPM (R 4.4.0) ## cowplot * 1.1.3 2024-01-22 [1] RSPM (R 4.4.0) ## crayon 1.5.2 2022-09-29 [2] RSPM (R 4.4.0) ## credentials 2.0.1 2023-09-06 [2] RSPM (R 4.4.0) ## curl 5.2.1 2024-03-01 [1] RSPM (R 4.4.0) ## data.table 1.15.4 2024-03-30 [1] RSPM (R 4.4.0) ## DBI 1.2.3 2024-06-02 [1] RSPM (R 4.4.0) ## dbplyr * 2.5.0 2024-03-19 [1] RSPM (R 4.4.0) ## DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.0) ## DelayedMatrixStats 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## digest 0.6.35 2024-03-11 [2] RSPM (R 4.4.0) ## doParallel 1.0.17 2022-02-07 [1] RSPM (R 4.4.0) ## dotCall64 1.1-1 2023-11-28 [1] RSPM (R 4.4.0) ## dplyr 1.1.4 2023-11-17 [1] RSPM (R 4.4.0) ## DT 0.33 2024-04-04 [1] RSPM (R 4.4.0) ## edgeR * 4.2.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ensembldb 2.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## EnvStats 2.8.1 2023-08-22 [1] RSPM (R 4.4.0) ## evaluate 0.24.0 2024-06-10 [2] RSPM (R 4.4.0) ## ExperimentHub 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ExploreModelMatrix * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## fANCOVA 0.6-1 2020-11-13 [1] RSPM (R 4.4.0) ## fansi 1.0.6 2023-12-08 [2] RSPM (R 4.4.0) ## fastmap 1.2.0 2024-05-15 [2] RSPM (R 4.4.0) ## fields 15.2 2023-08-17 [1] RSPM (R 4.4.0) ## filelock 1.0.3 2023-12-11 [1] RSPM (R 4.4.0) ## foreach 1.5.2 2022-02-02 [1] RSPM (R 4.4.0) ## foreign 0.8-86 2023-11-28 [3] CRAN (R 4.4.0) ## Formula 1.2-5 2023-02-24 [1] RSPM (R 4.4.0) ## fs 1.6.4 2024-04-25 [2] RSPM (R 4.4.0) ## generics 0.1.3 2022-07-05 [1] RSPM 
(R 4.4.0) ## GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) ## GenomeInfoDbData 1.2.12 2024-05-26 [1] Bioconductor ## GenomicAlignments 1.40.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## GenomicFeatures 1.56.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## GenomicRanges * 1.56.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## gert * 2.0.1 2023-12-04 [2] RSPM (R 4.4.0) ## GetoptLong 1.0.5 2020-12-15 [1] RSPM (R 4.4.0) ## ggbeeswarm 0.7.2 2023-04-29 [1] RSPM (R 4.4.0) ## ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0) ## ggrepel * 0.9.5 2024-01-10 [1] RSPM (R 4.4.0) ## gh * 1.4.1 2024-03-28 [2] RSPM (R 4.4.0) ## gitcreds * 0.1.2 2022-09-08 [2] RSPM (R 4.4.0) ## GlobalOptions 0.1.2 2020-06-10 [1] RSPM (R 4.4.0) ## glue 1.7.0 2024-01-09 [2] RSPM (R 4.4.0) ## golem 0.4.1 2023-06-05 [1] RSPM (R 4.4.0) ## gplots 3.1.3.1 2024-02-02 [1] RSPM (R 4.4.0) ## gridExtra 2.3 2017-09-09 [1] RSPM (R 4.4.0) ## gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0) ## gtools 3.9.5 2023-11-20 [1] RSPM (R 4.4.0) ## gypsum 1.0.1 2024-05-08 [1] Bioconductor 3.19 (R 4.4.0) ## HDF5Array 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## here * 1.0.1 2020-12-13 [1] RSPM (R 4.4.0) ## Hmisc * 5.1-3 2024-05-28 [1] RSPM (R 4.4.0) ## htmlTable 2.4.2 2023-10-29 [1] RSPM (R 4.4.0) ## htmltools 0.5.8.1 2024-04-04 [2] RSPM (R 4.4.0) ## htmlwidgets 1.6.4 2023-12-06 [2] RSPM (R 4.4.0) ## httpuv 1.6.15 2024-03-26 [2] RSPM (R 4.4.0) ## httr 1.4.7 2023-08-15 [2] RSPM (R 4.4.0) ## httr2 1.0.1 2024-04-01 [2] RSPM (R 4.4.0) ## igraph 2.0.3 2024-03-13 [1] RSPM (R 4.4.0) ## IRanges * 2.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## irlba 2.3.5.1 2022-10-03 [1] RSPM (R 4.4.0) ## iSEE * 2.16.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## iterators 1.0.14 2022-02-05 [1] RSPM (R 4.4.0) ## jquerylib 0.1.4 2021-04-26 [2] RSPM (R 4.4.0) ## jsonlite 1.8.8 2023-12-04 [2] RSPM (R 4.4.0) ## KEGGREST 1.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## KernSmooth 2.23-24 2024-05-17 [3] RSPM (R 4.4.0) ## knitr 
1.47 2024-05-29 [2] RSPM (R 4.4.0) ## later 1.3.2 2023-12-06 [2] RSPM (R 4.4.0) ## lattice 0.22-6 2024-03-20 [3] CRAN (R 4.4.0) ## lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.4.0) ## lifecycle 1.0.4 2023-11-07 [2] RSPM (R 4.4.0) ## limma * 3.60.2 2024-05-19 [1] Bioconductor 3.19 (R 4.4.0) ## listviewer 4.0.0 2023-09-30 [1] RSPM (R 4.4.0) ## lme4 1.1-35.3 2024-04-16 [1] RSPM (R 4.4.0) ## lmerTest 3.1-3 2020-10-23 [1] RSPM (R 4.4.0) ## lobstr * 1.1.2 2022-06-22 [1] RSPM (R 4.4.0) ## locfit 1.5-9.9 2024-03-01 [1] RSPM (R 4.4.0) ## magick 2.8.3 2024-02-18 [1] RSPM (R 4.4.0) ## magrittr 2.0.3 2022-03-30 [2] RSPM (R 4.4.0) ## maps 3.4.2 2023-12-15 [1] RSPM (R 4.4.0) ## MASS 7.3-60.2 2024-05-06 [3] local ## Matrix 1.7-0 2024-03-22 [3] CRAN (R 4.4.0) ## MatrixGenerics * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## matrixStats * 1.3.0 2024-04-11 [1] RSPM (R 4.4.0) ## memoise 2.0.1 2021-11-26 [2] RSPM (R 4.4.0) ## mgcv 1.9-1 2023-12-21 [3] CRAN (R 4.4.0) ## mime 0.12 2021-09-28 [2] RSPM (R 4.4.0) ## miniUI 0.1.1.1 2018-05-18 [2] RSPM (R 4.4.0) ## minqa 1.2.7 2024-05-20 [1] RSPM (R 4.4.0) ## munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0) ## mvtnorm 1.2-5 2024-05-21 [1] RSPM (R 4.4.0) ## nlme 3.1-165 2024-06-06 [3] RSPM (R 4.4.0) ## nloptr 2.0.3 2022-05-26 [1] RSPM (R 4.4.0) ## nnet 7.3-19 2023-05-03 [3] CRAN (R 4.4.0) ## numDeriv 2016.8-1.1 2019-06-06 [1] RSPM (R 4.4.0) ## openssl 2.2.0 2024-05-16 [2] RSPM (R 4.4.0) ## paletteer 1.6.0 2024-01-21 [1] RSPM (R 4.4.0) ## patchwork * 1.2.0 2024-01-08 [1] RSPM (R 4.4.0) ## pbkrtest 0.5.2 2023-01-19 [1] RSPM (R 4.4.0) ## pheatmap * 1.0.12 2019-01-04 [1] RSPM (R 4.4.0) ## pillar 1.9.0 2023-03-22 [2] RSPM (R 4.4.0) ## pkgconfig 2.0.3 2019-09-22 [2] RSPM (R 4.4.0) ## plotly 4.10.4 2024-01-13 [1] RSPM (R 4.4.0) ## plyr 1.8.9 2023-10-02 [1] RSPM (R 4.4.0) ## png 0.1-8 2022-11-29 [1] RSPM (R 4.4.0) ## Polychrome * 1.5.1 2022-05-03 [1] RSPM (R 4.4.0) ## postcards * 0.2.3 2022-01-07 [1] RSPM (R 4.4.0) ## promises 1.3.0 2024-04-05 [2] RSPM 
(R 4.4.0) ## ProtGenerics 1.36.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## purrr 1.0.2 2023-08-10 [2] RSPM (R 4.4.0) ## R.cache 0.16.0 2022-07-21 [1] RSPM (R 4.4.0) ## R.methodsS3 1.8.2 2022-06-13 [1] RSPM (R 4.4.0) ## R.oo 1.26.0 2024-01-24 [1] RSPM (R 4.4.0) ## R.utils 2.12.3 2023-11-18 [1] RSPM (R 4.4.0) ## R6 2.5.1 2021-08-19 [2] RSPM (R 4.4.0) ## rappdirs 0.3.3 2021-01-31 [2] RSPM (R 4.4.0) ## rbibutils 2.2.16 2023-10-25 [1] RSPM (R 4.4.0) ## RColorBrewer * 1.1-3 2022-04-03 [1] RSPM (R 4.4.0) ## Rcpp 1.0.12 2024-01-09 [2] RSPM (R 4.4.0) ## RCurl 1.98-1.14 2024-01-09 [1] RSPM (R 4.4.0) ## Rdpack 2.6 2023-11-08 [1] RSPM (R 4.4.0) ## recount3 * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## remaCor 0.0.18 2024-02-08 [1] RSPM (R 4.4.0) ## rematch2 2.1.2 2020-05-01 [2] RSPM (R 4.4.0) ## reshape2 1.4.4 2020-04-09 [1] RSPM (R 4.4.0) ## restfulr 0.0.15 2022-06-16 [1] RSPM (R 4.4.0) ## rhdf5 2.48.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## rhdf5filters 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## Rhdf5lib 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## RhpcBLASctl 0.23-42 2023-02-11 [1] RSPM (R 4.4.0) ## rintrojs 0.3.4 2024-01-11 [1] RSPM (R 4.4.0) ## rjson 0.2.21 2022-01-09 [1] RSPM (R 4.4.0) ## rlang * 1.1.4 2024-06-04 [2] RSPM (R 4.4.0) ## rmarkdown 2.27 2024-05-17 [2] RSPM (R 4.4.0) ## rpart 4.1.23 2023-12-05 [3] CRAN (R 4.4.0) ## rprojroot 2.0.4 2023-11-05 [2] RSPM (R 4.4.0) ## Rsamtools 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## RSQLite 2.3.7 2024-05-27 [1] RSPM (R 4.4.0) ## rstudioapi 0.16.0 2024-03-24 [2] RSPM (R 4.4.0) ## rsvd 1.0.5 2021-04-16 [1] RSPM (R 4.4.0) ## rtracklayer 1.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.0) ## S4Vectors * 0.42.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## sass 0.4.9 2024-03-15 [2] RSPM (R 4.4.0) ## ScaledMatrix 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0) ## scater * 1.32.0 
2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## scatterplot3d 0.3-44 2023-05-05 [1] RSPM (R 4.4.0) ## scRNAseq * 2.18.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) ## scuttle * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## sessioninfo * 1.2.2 2021-12-06 [2] RSPM (R 4.4.0) ## shape 1.4.6.1 2024-02-23 [1] RSPM (R 4.4.0) ## shiny 1.8.1.1 2024-04-02 [2] RSPM (R 4.4.0) ## shinyAce 0.4.2 2022-05-06 [1] RSPM (R 4.4.0) ## shinydashboard 0.7.2 2021-09-30 [1] RSPM (R 4.4.0) ## shinyjs 2.1.0 2021-12-23 [1] RSPM (R 4.4.0) ## shinyWidgets 0.8.6 2024-04-24 [1] RSPM (R 4.4.0) ## SingleCellExperiment * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## smokingMouse * 0.99.91 2024-06-11 [1] Github (LieberInstitute/smokingMouse@96d8480) ## spam 2.10-0 2023-10-23 [1] RSPM (R 4.4.0) ## SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) ## sparseMatrixStats 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## SpatialExperiment * 1.14.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## spatialLIBD * 1.16.2 2024-05-28 [1] Bioconductor 3.19 (R 4.4.0) ## statmod 1.5.0 2023-01-06 [1] RSPM (R 4.4.0) ## stringi 1.8.4 2024-05-06 [2] RSPM (R 4.4.0) ## stringr * 1.5.1 2023-11-14 [2] RSPM (R 4.4.0) ## styler 1.10.3 2024-04-07 [1] RSPM (R 4.4.0) ## SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) ## sys 3.4.2 2023-05-23 [2] RSPM (R 4.4.0) ## tibble 3.2.1 2023-03-20 [2] RSPM (R 4.4.0) ## tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.4.0) ## tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0) ## UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## usethis * 2.2.3 2024-02-19 [2] RSPM (R 4.4.0) ## utf8 1.2.4 2023-10-22 [2] RSPM (R 4.4.0) ## variancePartition * 1.34.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## vctrs 0.6.5 2023-12-01 [2] RSPM (R 4.4.0) ## vipor 0.4.7 2023-12-18 [1] RSPM (R 4.4.0) ## viridis 0.6.5 2024-01-29 [1] RSPM (R 4.4.0) ## viridisLite 0.4.2 2023-05-02 [1] RSPM (R 4.4.0) ## withr 3.0.0 2024-01-16 [2] RSPM (R 4.4.0) ## xfun 0.44 2024-05-15 [2] RSPM 
(R 4.4.0) ## XML 3.99-0.16.1 2024-01-22 [1] RSPM (R 4.4.0) ## xtable 1.8-4 2019-04-21 [2] RSPM (R 4.4.0) ## XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## yaml 2.3.8 2023-12-11 [2] RSPM (R 4.4.0) ## zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) ## ## [1] /__w/_temp/Library ## [2] /usr/local/lib/R/site-library ## [3] /usr/local/lib/R/library ## ## ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── proc.time() ## user system elapsed ## 14.294 0.998 15.070 System curl version: curl::curl_version() ## $version ## [1] "7.81.0" ## ## $ssl_version ## [1] "OpenSSL/3.0.2" ## ## $libz_version ## [1] "1.2.11" ## ## $libssh_version ## [1] "libssh/0.9.6/openssl/zlib" ## ## $libidn_version ## [1] "2.3.2" ## ## $host ## [1] "x86_64-pc-linux-gnu" ## ## $protocols ## [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap" ## [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp" ## [23] "smtps" "telnet" "tftp" ## ## $ipv6 ## [1] TRUE ## ## $http2 ## [1] TRUE ## ## $idn ## [1] TRUE This interactive book was last updated at 2024-06-11 10:55:00.904124. License This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. "],["summarizedexperiment-overview.html", "1 SummarizedExperiment overview 1.1 Overview 1.2 Exercises 1.3 Solutions", " 1 SummarizedExperiment overview Instructor: Leo LIBD rstats club notes 1.1 Overview The SummarizedExperiment class is used to store experimental results in the form of matrices. Objects of this class include observations (features) of the samples, as well as additional metadata. Usually, this type of object is automatically generated as the output of other software (i.e., SPEAQeasy), but you can also build them. One of the main characteristics of SummarizedExperiment is that it allows you to handle your data in a “coordinated” way. 
For example, if you want to subset your data, with SummarizedExperiment you can do so without worrying about keeping your assays and metadata synched. 1.2 Exercises We are going to use the sample data set from the airway library. library("SummarizedExperiment") library("airway") data(airway, package = "airway") se <- airway p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: a) How many genes do we have in this object? And samples? b) How many samples come from donors treated (trt) with dexamethasone (dex)? ## For a) you could only print the summary of the object but since the idea is ## to understand how to explore the object find another function that gives ## you the answer. se #> class: RangedSummarizedExperiment #> dim: 63677 8 #> metadata(1): '' #> assays(1): counts #> rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492 ENSG00000273493 #> rowData names(10): gene_id gene_name ... seq_coord_system symbol #> colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521 #> colData names(9): SampleName cell ... Sample BioSample ## Same thing for b, you could just print the colData and count the samples, ## but this is not efficient when our data consists of hundreds of samples. ## Find the answer using other tools. 
colData(se) #> DataFrame with 8 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345 SRS508568 SAMN02422669 #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349 SRS508571 SAMN02422678 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> [ reached getOption("max.print") -- omitted 4 rows ] Exercise 2: Add another assay that has the log10 of your original counts ## In our object, if you look at the part that says assays, we can see that ## at the moment we only have one with the name "counts". se #> class: RangedSummarizedExperiment #> dim: 63677 8 #> metadata(1): '' #> assays(1): counts #> rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492 ENSG00000273493 #> rowData names(10): gene_id gene_name ... seq_coord_system symbol #> colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521 #> colData names(9): SampleName cell ... Sample BioSample ## To see the data that's stored in that assay you can do either one of the ## next commands. 
assay(se) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] assays(se)$counts #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] ## Note that assay() does not support $ operator # assay(se)$counts ## We would have to do: assay(se, 1) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] assay(se, "counts") #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> ENSG00000000003 679 448 873 408 1138 1047 770 572 #> ENSG00000000005 0 0 0 0 0 0 0 0 #> ENSG00000000419 467 515 621 365 587 799 417 508 #> ENSG00000000457 260 211 263 164 245 331 233 229 #> ENSG00000000460 60 55 40 35 78 63 76 60 #> ENSG00000000938 0 0 2 0 1 0 0 0 #> [ reached getOption("max.print") -- omitted 63671 rows ] ## If you use assays() without specifying the element you want to see it ## shows you the length of the list and the name of each element. 
assays(se) #> List of length 1 #> names(1): counts ## To obtain a list of names as a vector you can use: assayNames(se) #> [1] "counts" ## Which can also be used to change the name of the assays assayNames(se)[1] <- "foo" assayNames(se) #> [1] "foo" assayNames(se)[1] <- "counts" Exercise 3: Explore the metadata and add a new column that has the library size of each sample. ## To calculate the library size use apply(assay(se), 2, sum) #> SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521 #> 20637971 18809481 25348649 15163415 24448408 30818215 19126151 21164133 1.3 Solutions p.solution { background-color: #C093D6; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Solution 1: ## For a), dim() gives the desired answer dim(se) #> [1] 63677 8 ## For b), colData(se)[colData(se)$dex == "trt", ] #> DataFrame with 4 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673 #> SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 SRS508580 SAMN02422677 colData(se)[se$dex == "trt", ] #> DataFrame with 4 rows and 9 columns #> SampleName cell dex albut Run avgLength Experiment Sample BioSample #> <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor> #> SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675 #> SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670 #> SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673 #> SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 
SRS508580 SAMN02422677 Solution 2: ## There are multiple ways to do it assay(se, "logcounts") <- log10(assay(se, "counts")) assays(se)$logcounts_v2 <- log10(assays(se)$counts) Solution 3: ## To add the library size we can use colData(se)$library_size <- apply(assay(se), 2, sum) names(colData(se)) #> [1] "SampleName" "cell" "dex" "albut" "Run" "avgLength" "Experiment" #> [8] "Sample" "BioSample" "library_size" "],["interactive-summarizedexperiment-visualizations.html", "2 Interactive SummarizedExperiment visualizations 2.1 Classes for iSEE 2.2 Getting Started with iSEE 2.3 Description of the user interface 2.4 Let’s practice! 2.5 Introduction to Advanced iSEE Features 2.6 References 2.7 Community", " 2 Interactive SummarizedExperiment visualizations Instructor: Melissa Mayén Quiroz How can you make plots from “SummarizedExperiment” objects without having to write any code? The answer is with “iSEE” http://bioconductor.org/packages/iSEE http://bioconductor.org/packages/release/bioc/vignettes/iSEE/inst/doc/basic.html iSEE is a Bioconductor package that provides an interactive Shiny-based graphical user interface for exploring data stored in SummarizedExperiment objects (Rue-Albrecht et al. 2018). 2.1 Classes for iSEE SummarizedExperiment (SE) and SingleCellExperiment (SCE) are classes in R. Classes serve as templates for creating objects that contain data and methods for manipulating those data. 2.1.1 SummarizedExperiment class Assay Data: The primary data matrix containing quantitative measurements, such as gene expression values or read counts. Rows represent features (e.g., genes, transcripts) and columns represent samples (e.g., experimental conditions, individuals). Row Metadata (rowData): Additional information about the features in the assay data. This can include annotations, identifiers, genomic coordinates, and other relevant information. Column Metadata (colData): Additional information about the samples in the assay data. 
This can include sample annotations, experimental conditions, treatment groups, and other relevant information. metadata: Additional information about the experiment. 2.1.2 SingleCellExperiment This object is specifically designed to store and analyze single-cell RNA sequencing (scRNA-seq) data. It extends the SummarizedExperiment class to include specialized features for single-cell data, such as cell identifiers, dimensionality reduction results, and methods for quality control and normalization. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent cells. colData (Column Metadata): Additional information about each cell, such as cell type, experimental condition, or any other relevant metadata. rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. reducedDims: Dimensionality reduction results, such as “principal component analysis” (PCA), “t-distributed stochastic neighbor embedding” (t-SNE), and “Uniform Manifold Approximation and Projection” (UMAP), used for visualizing and clustering cells. altExpNames and altExps: Names of alternative experiments (such as spike-in control genes used for normalization) and alternative experiment counts matrices. metadata: Additional metadata about the experiment. 2.1.3 SpatialExperiment This object extends the SingleCellExperiment class and is designed to store and analyze spatially-resolved transcriptomics data. Spatial transcriptomics combines gene expression data with spatial information, providing insights into the spatial organization of tissues. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent spatial spots or pixels. colData (Column Metadata): Additional information about each spatial spot or pixel, such as spatial coordinates, tissue section, or any other relevant metadata. 
rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. spatialCoords: A matrix or data frame containing the spatial coordinates (e.g., x and y coordinates) of each spot or pixel, which is crucial for spatial analyses and visualization. imgData: Links to image data associated with the spatial transcriptomics experiment, such as histology images or microscopy images, which provide the spatial context for the transcriptomics data. reducedDims: Dimensionality reduction results for visualizing and clustering spatial spots or pixels, similar to the SingleCellExperiment class. metadata: Additional metadata about the experiment. 2.2 Getting Started with iSEE Reference manual Adapted from The iSEE User’s Guide Installation (R version “4.4”). In this case, the package is already installed so we just need to load it. # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("iSEE") packageVersion("iSEE") #> [1] '2.16.0' library("iSEE") Documentation browseVignettes("iSEE") Use (simple launch): If you have a SummarizedExperiment object (se) or an instance of a subclass, like a SingleCellExperiment object (sce), you can launch an iSEE app by running: ## Launch iSEE for the se ("SummarizedExperiment" object) iSEE(se) ## Launch iSEE for the sce ("SingleCellExperiment" object) iSEE(sce) 2.3 Description of the user interface By default, the app starts with a dashboard that contains one panel or table of each type. By opening the collapsible panels named “Data parameters”, “Visual parameters”, and “Selection parameters” under each plot, we can control the content and appearance of each panel. Introductory tour: In the upper right corner there is a question mark icon ❓. Clicking it and then on the hand button you can have an introductory tour. 
During this tour, you will be taken through the different components of the iSEE user interface and learn the basic usage mechanisms by doing small actions guided by the tutorial: the highlighted elements will be responding to your actions, while the rest of the UI will be shaded. 2.3.1 Header The layout of the iSEE user interface uses the shinydashboard package. The dashboard header contains four dropdown menus. The “Organization” menu, which is identified by an icon displaying multiple windows “Export” dropdown menu, which is identified by a download icon The “Documentation” dropdown menu which is identified by a question mark icon ❓ The “Additional Information” dropdown menu which is identified by the information icon ℹ️ 2.3.1.1 Organization menu The “Organization” dropdown menu, identified by an icon displaying multiple windows, includes: “Organize panels” button opens a modal window that contains: A selectize input to add, remove, and reorder panels in the main interface. Two inputs to control the width and height, respectively, of each panel selected above. The “Examine panel chart” feature, identified by a chain icon, allows you to visualize the relationships and point selections among your visible plot and table panels. Each panel is represented by a node, color-coded to match the app. (This functionality is particularly useful in sessions with many panels, helping you to see the structure of how panels send and receive data point selections). 2.3.1.2 Export dropdown menu The “Export” dropdown menu, identified by a download icon, includes: The “Download panel output” feature that allows you to download a zip folder containing the currently displayed panel content, including high-resolution figures and table contents as CSV files. The “Extract the R code” feature which provides a way to record the exact code that reproduces the current state of each plot. 
Clicking on this button opens a popup window with a text editor displaying the formatted code with syntax highlighting. You can copy this code, including initial lines and sessionInfo() commands, to your clipboard for inclusion in your analysis report or script. This code can then be further edited for publication. “Display panel settings” lets you export the code defining the current state of the panels in the interface. This is useful for pre-configuring an iSEE instance to start in the current state rather than with the default set of panels. 2.3.1.3 Documentation Menu The “Documentation” dropdown, accessible through the question mark icon ❓, includes: Interactive Tour: Launches a guided tour of iSEE, teaching basic usage interactively. Open Vignette: Displays the iSEE vignette, either locally or from the Bioconductor project site. 2.3.1.4 Additional Information Menu The “Additional Information” dropdown, accessible through the information icon ℹ️, includes: About this Session: Shows the output of the sessionInfo() function in a popup. About iSEE: Provides information on the development team, licensing, citation, and links to the GitHub repository for following development and contributing suggestions. 2.3.2 Panel types The main element in the body of iSEE is the combination of panels, generated (and optionally linked to one another) according to your actions. There are currently eight standard panel types that can be generated with iSEE: Reduced dimension plot Column data table Column data plot Feature assay plot Row data table Row data plot Sample assay plot Complex heatmap In addition, custom panel types can be defined. 2.3.3 Parameter sets For each standard plot panel, three different sets of parameters will be available in collapsible boxes: “Data parameters”, to control parameters specific to each type of plot. 
“Visual parameters”, to specify parameters that will determine the aspect of the plot, in terms of coloring, point features, and more (e.g., legend placement, font size). “Selection parameters” to control the incoming point selection and link relationships to other plots. 2.3.4 Reduced dimension plots If a SingleCellExperiment object is supplied to the iSEE::iSEE() function, reduced dimension results are extracted from the reducedDim slot. Examples include low-dimensional embeddings from principal components analysis (PCA) or t-distributed stochastic neighbour embedding (t-SNE). These results are used to construct a two-dimensional Reduced dimension plot where each point is a sample, to facilitate efficient exploration of high-dimensional datasets. The “Data parameters” control the reducedDim slot to be displayed, as well as the two dimensions to plot against each other. Note that this built-in panel does not compute reduced dimension embeddings; they must be precomputed and available in the object provided to the iSEE() function. Nevertheless, custom panels - such as the iSEE DynamicReducedDimensionPlot can be developed and used to enable such features. 2.3.5 Column data plots A Column data plot visualizes sample metadata contained in column metadata. Different fields can be used for the x- and y-axes by selecting appropriate values in the “Data parameters” box. This plot can assume various forms, depending on the nature of the data on the x- and y-axes: If the y-axis is continuous and the x-axis is categorical, violin plots are generated (grouped by the x-axis factor). If the y-axis is categorical and the x-axis is continuous, horizontal violin plots are generated (grouped by the y-axis factor). If both axes are continuous, a scatter plot is generated. This enables the use of contours that are overlaid on top of the plot, check the “Other” box to see the available options. 
If both axes are categorical, a plot of squares (Hinton plot) is generated where the area of each square is proportional to the number of samples within each combination of factor levels. 2.3.6 Feature assay plots A Feature assay plot visualizes the assayed values (e.g., gene expression) for a particular feature (e.g., gene) across the samples on the y-axis. This usually results in a (grouped) violin plot, if the x-axis is set to “None” or a categorical variable; or a scatter plot, if the x-axis is another continuous variable. Gene selection for the y-axis can be achieved by using a linked row data table in another panel. Clicking on a row in the table automatically changes the assayed values plotted on the y-axis. Alternatively, the row name can be directly entered as text that corresponds to an entry of rownames(se). (This is not effective if se does not contain row names.) 2.3.7 Row data plots A Row data plot allows the visualization of information stored in the rowData slot of a “SummarizedExperiment” object. Its behavior mirrors the implementation for the Column data plot, and correspondingly this plot can assume various forms depending on whether the data are categorical or continuous. 2.3.8 Sample assay plots A Sample assay plot visualizes the assayed values (e.g., gene expression) for a particular sample (e.g., cell) across the features on the y-axis. This usually results in a (grouped) violin plot, if the x-axis is set to “None” or a categorical variable (e.g., gene biotype); or a scatter plot, if the x-axis is another continuous variable. Notably, the x-axis covariate can also be set to: A discrete row data covariates (e.g., gene biotype), to stratify the distribution of assayed values A continuous row data covariate (e.g., count of cells expressing each gene) Another sample, to visualize and compare the assayed values in any two samples. 2.3.9 Row data tables A Row data table contains the values of the rowData slot. 
If none are available, a column named Present is added and set to TRUE for all features, to avoid issues with DT::datatable() and an empty DataFrame. Typically, these tables are used to link to other plots to determine the features to use for plotting or coloring. 2.3.10 Column data tables A Column data table contains the values of the colData slot. Its behavior mirrors the implementation for the Row data table. Correspondingly, if none are available, a column named Present is added and set to TRUE for all samples. Typically, these tables are used to link to other plots to determine the samples to use for plotting or coloring. 2.3.11 Heat maps Heat map panels provide a compact overview of the data for multiple features in the form of color-coded matrices. These correspond to the assays stored in the SCE/SE object, where features (e.g., genes) are the rows and samples are the columns. Users can select features (rows) to display from the selectize widget (which supports autocompletion), or also via other panels, like row data plots or row data tables. In addition, users can rapidly import custom lists of feature names using a modal popup that provides an Ace editor where they can directly type or paste feature names, and a file upload button that accepts text files containing one feature name per line. Users should remember to click the “Apply” button before closing the modal, to update the heat map with the new list of features. The “Suggest feature order” button clusters the rows, and also rearranges the elements in the selectize according to the clustering. It is also possible to choose which assay type is displayed (\"logcounts\" being the default choice, if available). Samples in the heat map can also be annotated, simply by selecting relevant column metadata. A zooming functionality is also available, restricted to the y-axis (i.e., allowing closer inspection of the individual features included). 
2.3.12 Description of iSEE functionality 2.3.12.1 Coloring plots by sample attributes 2.3.12.1.1 Column-based plots Column-based plots are: reduced dimension feature assay column data plots Where each data point represents a sample. Here, data points can be colored in different ways: The default is no color scheme (“None” in the radio button). Any column of colData(se) can be used. The plot automatically adjusts the scale to use based on whether the chosen column is continuous or categorical. The assay values of a particular feature in each sample can be used. The feature can be chosen either via a linked row table or selectize input (as described for the Feature assay plot panel). Users can also specify the assays from which values are extracted. The identity of a particular sample can be used, which will be highlighted on the plot in a user-specified color. The sample can be chosen either via a linked column table or via a selectize input. 2.3.12.1.2 Row-based plots For row-based plots (i.e., the sample assay and row data plots), each data point represents a feature. Like the column-based plots, data points can be colored by: “None”, yielding data points of fixed color. Any column of rowData(se). The identity of a particular feature, which is highlighted in the user-specified color. Assay values for a particular sample. 2.3.12.2 Controlling point aesthetics Data points can be set to different shapes according to categorical factors in colData(se) (for column-based plots) or rowData(se) (for row-based plots). This is achieved by checking the “Shape” box to reveal the shape-setting options. The size and opacity of the data points can be modified via the options available by checking the “Point” box. This may be useful for aesthetically pleasing visualizations when the number of points is very large or small. 2.3.12.3 Faceting Each point-based plot can be split into multiple facets using the options in the “Facet” checkbox. 
Users can facet by row and/or column, using categorical factors in colData(se) (for column-based plots) or rowData(se) (for row-based plots). This provides a convenient way to stratify points in a single plot by multiple factors of interest. Note that point selection can only occur within a single facet at a time; points cannot be selected across facets. 2.3.12.4 Zooming in and out Zooming in is possible by first selecting a region of interest in a plot using the brush (drag and select); double-clicking on the brushed area then zooms into the selected area. To zoom out to the original plot, simply double-click at any location in the plot. 2.4 Let’s practice! 2.4.1 Setting up the data We’ll download a SingleCellExperiment object, which is similar to SummarizedExperiment as it extends it. http://bioconductor.org/packages/SingleCellExperiment http://bioconductor.org/packages/spatialLIBD https://doi.org/10.1038/s41593-020-00787-0 https://osca.bioconductor.org/ https://www.nature.com/articles/s41592-019-0654-x Figures 2 and 3 ## Lets get some data using spatialLIBD sce_layer <- spatialLIBD::fetch_data("sce_layer") #> adding rname 'https://www.dropbox.com/s/bg8xwysh2vnjwvg/Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata?dl=1' #> 2024-06-11 10:55:10.106743 loading file /github/home/.cache/R/BiocFileCache/4764df94b32_Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata%3Fdl%3D1 sce_layer #> class: SingleCellExperiment #> dim: 22331 76 #> metadata(0): #> assays(2): counts logcounts #> rownames(22331): ENSG00000243485 ENSG00000238009 ... ENSG00000278384 ENSG00000271254 #> rowData names(10): source type ... is_top_hvg is_top_hvg_sce_layer #> colnames(76): 151507_Layer1 151507_Layer2 ... 151676_Layer6 151676_WM #> colData names(13): sample_name layer_guess ... layer_guess_reordered_short spatialLIBD #> reducedDimNames(6): PCA TSNE_perplexity5 ... 
UMAP_neighbors15 PCAsub #> mainExpName: NULL #> altExpNames(0): ## We can check how big the object is with lobstr lobstr::obj_size(sce_layer) #> 33.99 MB NOTE: if you run into this error: Error in `BiocFileCache::bfcrpath()`: ! not all 'rnames' found or unique. Backtrace: 1. spatialLIBD::fetch_data("sce_layer") 3. BiocFileCache::bfcrpath(bfc, url) check the output of curl::curl_version()$version #> [1] "7.81.0" If it’s version 8.6.0, you likely need to upgrade to version 8.8.0. For macOS users, you can do this via Homebrew with ## Install homebrew from https://brew.sh/ brew install curl then install curl from source with: Sys.setenv(PKG_CONFIG_PATH = "/opt/homebrew/opt/curl/lib/pkgconfig") install.packages("curl", type = "source") For all the gory details, check https://github.com/curl/curl/issues/13725, https://github.com/Bioconductor/BiocFileCache/issues/48, and related issues. As a workaround, you could also run this: tmp_sce_layer <- tempfile("sce_layer.RData") download.file( "https://www.dropbox.com/s/bg8xwysh2vnjwvg/Human_DLPFC_Visium_processedData_sce_scran_sce_layer_spatialLIBD.Rdata?dl=1", tmp_sce_layer, mode = "wb" ) load(tmp_sce_layer, verbose = TRUE) #> Loading objects: #> sce_layer sce_layer #> class: SingleCellExperiment #> dim: 22331 76 #> metadata(0): #> assays(2): counts logcounts #> rownames(22331): ENSG00000243485 ENSG00000238009 ... ENSG00000278384 ENSG00000271254 #> rowData names(10): source type ... is_top_hvg is_top_hvg_sce_layer #> colnames(76): 151507_Layer1 151507_Layer2 ... 151676_Layer6 151676_WM #> colData names(12): sample_name layer_guess ... layer_guess_reordered layer_guess_reordered_short #> reducedDimNames(6): PCA TSNE_perplexity5 ... UMAP_neighbors15 PCAsub #> mainExpName: NULL #> altExpNames(0): 2.4.2 Explore the Data Now we can deploy iSEE() to explore the data. 
## Load library library("iSEE") ## Deploy iSEE(sce_layer) p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Question 1: Which panel Type is displaying the following plot? Exercise 1: Recreate the following plot. Question 2: What is different between these 2 plots? Exercise 2: Recreate the following plot. Question 3: What is different between these 2 plots? Exercise 3: Recreate the following plot. Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 4: Recreate the following plot. What would you change from the last one? Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 5: Recreate the following plot. What would you change from the last one? Ensembl IDs: ENSG00000177757 ENSG00000237491 ENSG00000238009 ENSG00000243485 Exercise 6: Download only the last plot (Final HeatMap) Exercise 7: Extract the R code only for the last plot (Final HeatMap) 2.5 Introduction to Advanced iSEE Features Adapted from the GitHub Issue: https://github.com/iSEE/iSEE/issues/650 Beyond its basic functionalities, iSEE offers advanced features that allow users to perform complex data manipulations interactively. This includes the ability to subset and filter cells based on gene expression criteria. To begin with, we will load the necessary libraries and dataset. In this case we will be using ReprocessedAllenData from the scRNAseq package, a dataset of 379 mouse brain cells from Tasic et al. (2016). After loading the dataset, we normalize the counts and perform a PCA (Principal Component Analysis) to prepare the data for visualization. 
library("scRNAseq") library("scater") library("iSEE") # Load the dataset sce <- ReprocessedAllenData(assays = "tophat_counts") # Normalize counts and perform PCA sce <- logNormCounts(sce, exprs_values = "tophat_counts") sce <- runPCA(sce, ncomponents = 4) 2.5.1 Selecting Cells Based on a Single Gene Expression To select cells based on the expression of a single gene using iSEE, we need to create an initial list of panels that will be displayed when we launch iSEE. The first panel in our list is a “FeatureAssayPlot”, which will show the expression levels of the gene “Serpine2”. By visualizing this plot, we can interactively select cells that express “Serpine2”. To complement this, we add a “ReducedDimensionPlot” to our panel list. This plot will visualize the PCA and highlight the cells that we selected based on “Serpine2” expression. The linkage between these two panels allows us to see how the selected cells are distributed in the reduced dimensional space (PCA). 2.5.2 Using a Single Plot for Two Gene Co-Expression To select cells based on the expression of two genes, we can use a single “FeatureAssayPlot” panel. In this setup, one gene is plotted on the x-axis and the other gene on the y-axis. This method allows us to directly visualize and select cells that express both genes simultaneously. By adding a “ReducedDimensionPlot” to our initial panel list, we can again see how these selected cells are distributed in the PCA plot. This approach is simpler when dealing with only two genes and provides an intuitive way to explore co-expression patterns. 
## Initial settings for 2 genes expression on the same "FeatureAssayPlot" initial_combined <- list( FeatureAssayPlot(Assay = "logcounts", XAxis = "Feature name", XAxisFeatureName = "Serpine2", YAxisFeatureName = "Bcl6"), ReducedDimensionPlot(Type = "PCA", ColorBy = "Column selection", ColumnSelectionSource = "FeatureAssayPlot1") ) ## Launch iSEE with the initial settings if (interactive()) { iSEE(sce, initial = initial_combined) } 2.5.3 Selecting Cells Based on the Co-Expression of Two or more Genes In situations where we want to select cells based on the expression of two or more genes, we need to chain multiple “FeatureAssayPlot” panels together. For instance, if we are interested in cells that express both “Serpine2” and “Bcl6”, we start by creating a “FeatureAssayPlot” for “Serpine2”. Then, we add another “FeatureAssayPlot” for “Bcl6”, but this time we specify that the selection source for this plot is the “FeatureAssayPlot” for “Serpine2”. This setup ensures that only cells that were selected in the first plot (based on “Serpine2”) are displayed in the second plot (for “Bcl6”). Finally, we include a ReducedDimensionPlot to visualize the PCA, highlighting the cells that meet both criteria. This chained selection process allows for more refined filtering based on multiple gene expressions. 
## Initial settings chaining multiple "FeatureAssayPlot" initial_double <- list( FeatureAssayPlot(Assay = "logcounts", YAxisFeatureName = "Serpine2"), FeatureAssayPlot(Assay = "logcounts", YAxisFeatureName = "Bcl6", ColumnSelectionSource = "FeatureAssayPlot1", ColumnSelectionRestrict = TRUE), ReducedDimensionPlot(Type = "PCA", ColorBy = "Column selection", ColumnSelectionSource = "FeatureAssayPlot2") ) ## Launch iSEE with the initial settings if (interactive()) { iSEE(sce, initial = initial_double) } 2.6 References https://www.bioconductor.org/packages/release/bioc/html/iSEE.html https://github.com/iSEE/iSEE https://shiny.posit.co/r/gallery/life-sciences/isee/ https://bioconductor.org/packages/release/bioc/vignettes/iSEE/inst/doc/basic.html https://github.com/iSEE/iSEE/issues/650 2.7 Community iSEE authors: Kévin Rue-Albrecht https://twitter.com/KevinRUE67 Federico Marini https://twitter.com/FedeBioinfo Charlotte Soneson https://bsky.app/profile/csoneson.bsky.social Aaron Lun https://twitter.com/realAaronLun "],["recount3-introduction.html", "3 recount3 introduction 3.1 recount projects 3.2 Using recount3 3.3 Exercise 3.4 Community", " 3 recount3 introduction Instructor: Leo Don’t let useful data go to waste by Franziska Denk https://doi.org/10.1038/543007a 3.1 recount projects ReCount: data from 20 studies http://bowtie-bio.sourceforge.net/recount/index.shtml Paper from 2011 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-449 recount: over 70k human bulk RNA-seq samples uniformly processed https://jhubiostatistics.shinyapps.io/recount/ pkgdown documentation website: http://leekgroup.github.io/recount/ Bioconductor documentation website: http://bioconductor.org/packages/recount Main paper (2017) http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html Paper that explains why the counts are different from the usual ones https://f1000research.com/articles/6-1558/v1 Example analyses we did and provided as a companion website for the 2017 
paper http://leekgroup.github.io/recount-analyses/ recount3: over 700k bulk RNA-seq samples from human and mouse http://rna.recount.bio/ pkgdown documentation website: http://research.libd.org/recount3/ Bioconductor documentation website: http://bioconductor.org/packages/recount3 Pre-print: May 2021 https://doi.org/10.1101/2021.05.21.445138 Paper: November 2021 https://doi.org/10.1186/s13059-021-02533-6 These projects help such that anyone, particularly those without access to a high performance computing (HPC) system (aka a compute cluster), can access these datasets. It’s like democratizing access to the gene expression data ^^. 3.2 Using recount3 Check the original documentation here and here. Let’s first load recount3 which will load all the required dependencies including SummarizedExperiment. ## Load recount3 R package library("recount3") Next we need to identify a study of interest as well as choose whether we want the data at the gene, exon, or some other feature level. Once we have identified our study of interest, we can download the files and build a SummarizedExperiment object using recount3::create_rse() as we’ll show next. create_rse() has arguments through which we can control what annotation we want to use (they are organism-dependent). ## Lets download all the available projects human_projects <- available_projects() #> 2024-06-11 10:55:30.995138 caching file sra.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/sra.recount_project.MD.gz' #> 2024-06-11 10:55:32.834054 caching file gtex.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/gtex/metadata/gtex.recount_project.MD.gz' #> 2024-06-11 10:55:34.303925 caching file tcga.recount_project.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/tcga/metadata/tcga.recount_project.MD.gz' ## Find your project of interest. 
Here we'll use ## SRP009615 as an example proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Build a RangedSummarizedExperiment (RSE) object ## with the information at the gene level rse_gene_SRP009615 <- create_rse(proj_info) #> 2024-06-11 10:55:37.221981 downloading and reading the metadata. #> 2024-06-11 10:55:37.724484 caching file sra.sra.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.sra.SRP009615.MD.gz' #> 2024-06-11 10:55:39.000263 caching file sra.recount_project.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_project.SRP009615.MD.gz' #> 2024-06-11 10:55:40.337901 caching file sra.recount_qc.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_qc.SRP009615.MD.gz' #> 2024-06-11 10:55:41.57176 caching file sra.recount_seq_qc.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_seq_qc.SRP009615.MD.gz' #> 2024-06-11 10:55:42.992307 caching file sra.recount_pred.SRP009615.MD.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/metadata/15/SRP009615/sra.recount_pred.SRP009615.MD.gz' #> 2024-06-11 10:55:43.849074 downloading and reading the feature information. #> 2024-06-11 10:55:44.457463 caching file human.gene_sums.G026.gtf.gz. #> adding rname 'http://duffel.rail.bio/recount3/human/annotations/gene_sums/human.gene_sums.G026.gtf.gz' #> 2024-06-11 10:55:45.823854 downloading and reading the counts: 12 samples across 63856 features. #> 2024-06-11 10:55:46.293574 caching file sra.gene_sums.SRP009615.G026.gz. 
#> adding rname 'http://duffel.rail.bio/recount3/human/data_sources/sra/gene_sums/15/SRP009615/sra.gene_sums.SRP009615.G026.gz' #> 2024-06-11 10:55:47.329599 constructing the RangedSummarizedExperiment (rse) object. ## Explore the resulting object rse_gene_SRP009615 #> class: RangedSummarizedExperiment #> dim: 63856 12 #> metadata(8): time_created recount3_version ... annotation recount3_url #> assays(1): raw_counts #> rownames(63856): ENSG00000278704.1 ENSG00000277400.1 ... ENSG00000182484.15_PAR_Y ENSG00000227159.8_PAR_Y #> rowData names(10): source type ... havana_gene tag #> colnames(12): SRR387777 SRR387778 ... SRR389077 SRR389078 #> colData names(175): rail_id external_id ... recount_pred.curated.cell_line BigWigURL ## How large is it? lobstr::obj_size(rse_gene_SRP009615) #> 24.81 MB We can also interactively choose our study of interest using the following code or through the recount3 study explorer. ## Explore available human projects interactively proj_info_interactive <- interactiveDisplayBase::display(human_projects) ## Choose only 1 row in the table, then click on "send". ## Lets double check that you indeed selected only 1 row in the table stopifnot(nrow(proj_info_interactive) == 1) ## Now we can build the RSE object rse_gene_interactive <- create_rse(proj_info_interactive) Now that we have the data, we can use recount3::transform_counts() or recount3::compute_read_counts() to convert the raw counts into a format expected by downstream tools. For more details, check the recountWorkflow paper. ## We'll compute read counts, which is what most downstream software ## uses. ## For other types of transformations such as RPKM and TPM, use ## transform_counts(). assay(rse_gene_SRP009615, "counts") <- compute_read_counts(rse_gene_SRP009615) ## Lets make it easier to use the information available for this study ## that was provided by the original authors of the study. 
rse_gene_SRP009615 <- expand_sra_attributes(rse_gene_SRP009615) colData(rse_gene_SRP009615)[ , grepl("^sra_attribute", colnames(colData(rse_gene_SRP009615))) ] #> DataFrame with 12 rows and 4 columns #> sra_attribute.cells sra_attribute.shRNA_expression sra_attribute.source_name sra_attribute.treatment #> <character> <character> <character> <character> #> SRR387777 K562 no SL2933 Puromycin #> SRR387778 K562 yes, targeting SRF SL2934 Puromycin, doxycycline #> SRR387779 K562 no SL5265 Puromycin #> SRR387780 K562 yes targeting SRF SL3141 Puromycin, doxycycline #> SRR389079 K562 no shRNA expression SL6485 Puromycin #> ... ... ... ... ... #> SRR389082 K562 expressing shRNA tar.. SL2592 Puromycin, doxycycline #> SRR389083 K562 no shRNA expression SL4337 Puromycin #> SRR389084 K562 expressing shRNA tar.. SL4326 Puromycin, doxycycline #> SRR389077 K562 no shRNA expression SL1584 Puromycin #> SRR389078 K562 expressing shRNA tar.. SL1583 Puromycin, doxycycline We are now ready to use other bulk RNA-seq data analysis software tools. 3.3 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: Use iSEE to reproduce the following image Hints: Use dynamic feature selection Use information from columns (samples) for the X axis Use information from columns (samples) for the colors (optional) Create your free account at https://www.shinyapps.io/ and share your iSEE app with the world. Regrettably iSEE::iSEE() will need more than the default free 1 GB RAM option available from https://www.shinyapps.io/. Real examples used on a paper: https://github.com/LieberInstitute/10xPilot_snRNAseq-human#explore-the-data-interactively. Example from another course: https://libd.shinyapps.io/SRP009615/. It was created with https://github.com/lcolladotor/rnaseq_2023_notas_en_vivo/blob/main/app.R. 
3.4 Community recount2 and 3 authors on Twitter: https://twitter.com/chrisnwilks https://twitter.com/BenLangmead https://twitter.com/KasperDHansen https://bsky.app/profile/nav.bsky.social https://twitter.com/Shannon_E_Ellis https://twitter.com/jtleek More about the different types of counts: If I'm using recount2 data for a differential analysis in DEseq2, should I be using the original counts, or the scaled counts?@mikelove @lcolladotor #rstats #Bioconductor — Dr. Robert M Flight, PhD (@rmflight) January 29, 2021 Tweets from the community From a student in the LCG-UNAM 2021 course: @lcolladotor Earlier I was looking for some data to analyze in recount, they have so much, I seriously can't decide what to use! https://t.co/fIJwXq46TzThanks for such an useful package!@chrisnwilks @BenLangmead @KasperDHansen @AbhiNellore @Shannon_E_Ellis @jtleek — Axel Zagal Norman (@NormanZagal) February 25, 2021 Exploring the possibility of using recount3 data for an analysis (January 2022): I have found a novel exon expressed in a cancer sample. I would like to search TCGA/SRA to identify other samples with the same/similar exon. It will be rare. Can I use Recount3, megadepth for this? @jtleek @lcolladotor @BenLangmead — Alicia Oshlack (@AliciaOshlack) January 5, 2022 Others discussing meta analyses publicly on Twitter: Thinking on this a bit it is strange how few people are doing “medium-sized” meta analyses of transcriptiomics. One on end you have @BenLangmead @lcolladotor reprocessing (with a touch of analysis) most of SRA. And you see papers pulling an dataset or two to corroborate. — David McGaughey (@David_McGaughey) February 1, 2022 That might be a gin&tonic in my hand, but it still holds true that #recount3 is a wonderful resource and super useful in our annotation efforts! Great to meet you @lcolladotor!! 
https://t.co/cSCZAajhrY — GencodeGenes (@GencodeGenes) May 11, 2024 "],["differential-gene-expression-analysis-overview.html", "4 Differential Gene Expression analysis overview 4.1 Preliminary steps 4.2 Differential Gene Expression 4.3 Downstream analyses References", " 4 Differential Gene Expression analysis overview Instructor: Daianna González Padilla Differential Gene Expression (DGE) analyses are common statistical analyses of gene expression data that aim to discover genes significantly altered in their expression levels between experimental groups, which can be given by a condition, treatment, experimental procedure/exposure, diagnostic, time points, by biological origins (e.g. differences in sex, tissue, age, species), and even by different technical methodologies. These genes are known as Differentially Expressed Genes (DEGs) and can be either up- or down-regulated if their expression is greater or less in one group with respect to the other(s), respectively. Diverse methods exist to perform DGE and multiple downstream analyses can be applied on DEGs, but a series of non-skippable preliminary steps exists which are necessary to correctly perform previous to any statistical testing. Below a classic workflow for DGE is depicted. It takes as input the gene expression matrix with raw read counts for genes (as rows) across all samples (as columns). Among the preliminary steps, there is an initial data processing step encompassing count normalization and filtering of lowly-expressed genes. Secondly, Exploratory Data Analysis (EDA) involves assessment of Quality Control (QC) metrics of the samples and filtering of poor-quality ones, as well as an examination of the gene expression profiles between sample groups, potential detection of additional atypical samples to remove, and the exploration of the correlations between sample-level variables and their contributions in the expression variance of each gene to guide the covariate selection for DGE models. 
Figure 1: Summary of the analyses for differential expression. 1. RNA-seq data processing: raw counts are normalized and log-scaled (lognorm counts) and the lowly-expressed genes are filtered out. 2. Exploratory Data Analysis: quality metrics of the samples are compared across groups, the poor-quality samples are filtered and both sample-level and gene-level effects of sample variables are explored to identify those that are main drivers of gene expression variation to include in the models for DGE. 3. Differential Expression Analysis: under the limma-voom pipeline the expression of each gene is linearly modeled by the selected variables in the previous step; after fitting the model gene-wise log2-fold changes (log2FC) and p-values are obtained for the variable of interest and other statistics of differential expression are also computed and compared. Here DEGs are determined based on the significance threshold (controlling for the False Discovery Rate or FDR). 4. Functional Enrichment Analysis: an overrepresentation analysis (ORA) is performed to find statistically significant associations between our groups of DEGs and gene sets annotated in GO terms and KEGG pathways; here we identify biological processes, cellular functions and components, and pathways potentially affected or involved in the experimental condition under study. 5. DE visualization: heatmaps are created to visually contrast gene expression levels of DEGs in the experimental groups. Abbreviations: CPM: counts per million; QC: quality control; PC: principal component; DEG(s): differentially expressed gene(s); Ctrl: control; Expt: experimental; GO: Gene Ontology; KEGG: Kyoto Encyclopedia of Genes and Genomes. 4.1 Preliminary steps Evident computational steps right after sequencing involve raw sequencing reads Quality Control (QC) analysis and read alignment to a reference genome for the subsequent gene expression quantification, generating the input for DGE. 
Comprehensive pipelines have been developed for these purposes, such as the RNA-seq processing pipeline SPEAQeasy (Eagles, N.J. et al. 2021) that provides a flexible, user-friendly, and reproducible pipeline to perform all such analyses through the implementation of a single workflow, liberating from the need of performing each step individually. p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 15px; font-family: sans-serif; } 👉🏼 More details of this pipeline are provided in the original manuscript, the documentation website, and in other created resources. 4.1.1 RNA-seq data processing Once the gene expression matrix has been generated we can proceed to process the read counts. Raw counts are typically zero-enriched and not normally-distributed, opposite to what is required for the application of several statistical methods. Furthermore, raw gene expression values may reflect protocol-specific biases and biological factors other than the one of interest. Therefore raw gene expression counts must be normalized and lowly-expressed genes filtered out. 4.1.1.1 Data normalization Differences between samples such as library sizes (sequencing depths), and RNA composition, as well as different gene lengths and GC contents make raw gene expression data not comparable between samples. Several normalization strategies can be applied to surpass such differences; commonly counts-per-million (cpm) are used. In addition to count normalization, a log-transformation is required to make cpm follow an approximately normal distribution. 4.1.1.2 Gene filtering Often the expression profiling platform (microarray or RNA-seq) includes genes that do not appear to be expressed to a worthwhile degree in any or most of the samples. This might occur, for instance, in cases where genes are not expressed in any of the cell types being experimentally profiled [1]. 
We want to remove those genes prior to posterior analyses, which is justified on both biological and statistical grounds [2]: Biologically, it is considered that a gene must have a minimal expression level to be translated into a protein or to be of biological importance. Statistically, lowly-expressed genes are unlikely to be detected as DE because low counts don’t provide the required statistical evidence to assess differential expression. Different approaches exist for this step, ranging from the simple definition of a gene expression cutoff to those taking into account not only global gene expression but also the number of samples in which they have a minimum expression level. 👉🏼 More details about normalization and filtering can be consulted in the course material of 2023 (Statistical Analysis of Genome Scale Data 2023: Data preparation). Figure 2: RNA-seq data processing steps. 1. Count log-normalization: distribution of read counts before and after normalization and log-transformation into \\(log_2(cpm+0.5)\\) using calcNormFactors() and cpm() of edgeR. 2. Gene filtering: distribution of \\(log_2(cpm+0.5)\\) before and after filtering lowly-expressed genes; note the elimination of counts that were originally zeros. 4.1.2 Exploratory Data Analysis The Exploratory Data Analysis (EDA) is a primordial step in which, as the name refers, we explore relevant aspects of the RNA-seq data. In this process we basically create tons of plots, charts and graphs to visualize the data, assess their quality and inspect their variability. This clearly exposes low-quality samples and relationships and contributions in gene expression variance of sample-level variables. This allows to draw valuable information from our data that could impact posterior analyses, including DGE. 
Thus EDA guides filtering steps, the execution of additional analyses, the selection of covariates for DGE models and of statistical tests based on data features, and could also aid in more accurate interpretations of the results. Although here we describe EDA as comprising QCA, dimensionality reduction to explore sample-level effects, and variance partition analysis to explore gene-level effects, EDA is not a well-defined process that can be followed by concrete instructions or steps. The analyses you run and what you plot depend on the particular questions you’re trying to answer, what you would like to know about your data and of course, it completely depends on the specific characteristics of your own dataset. 4.1.2.1 Quality Control Analysis (QCA) First, the quality metrics of the samples regarding read and RNA contents, and read mapping rates have to be compared in order to (Figure 3: step 1): Identify individual samples or groups of samples of poor quality that may have arisen by technical causes during experimental steps. Evaluate if samples from the groups of interest for DGE (diagnostic, treatment, etc.) differ in their quality metrics as these can represent confounding factors for differential expression. Detect high biological variability to subsequently support data partition to perform subanalyses from the data. Further, we are also interested in investigating trends and relationships between sample variables to unveil underlying technical and biological aspects of the observed data (Figure 3: step 2). After having identified poor-quality samples, we have to remove them to not include the unreliable expression data they provide in downstream analyses. Cutoffs can be defined for specific QC metrics to decide which samples to keep; this, however, is not strongly recommended as no consolidated references exist to define such cutoffs and therefore rather represent arbitrary values. 
Other approaches include identifying outlier QC metrics (Figure 3: step 3), but again, caution must be taken as outlier definition is also arbitrary and we could be discarding good-quality samples. Figure 3: Quality Control Analysis steps. 1. Evaluate QC metrics for groups of samples: sample QC metrics such as the fraction of reads that mapped to the mitochondrial chromosome (mitoRate) and to the reference genome (overallMapRate) are compared between sample groups given by the variable of interest (Group in this example), technical variables (e.g. plate for sample library preparation), and biological variables (e.g. Age). 2. Examine relationships between sample variables: pairs of QC metrics are compared; here mitoRate and the fraction of reads assigned to rRNA genes (rRNA_rate), as well as the library size (sum) and the number of expressed genes (detected) are plotted to explore the relationships they present with each other and with other sample metadata variables. (Group and Age). 3. QC-based sample filtering: outlier QC metrics (red) are detected based on +/- 3 median-absolute-deviations (MADs) away (dotted lines) from the median (solid line). 👉🏼 See more details about QCA in Statistical Analysis of Genome Scale Data 2023: Quality Control Analysis. 4.1.2.2 Exploration of sample-level effects Sample gene expression profiles can be analyzed and compared after dimensionality reduction procedures such as Principal Component Analysis (PCA) and Multidimensional-Scaling (MDS). These analyses are useful to potentially detect samples with outlier transcriptomic profiles to further remove and to identify sample variables driving gene expression variations (Figure 4). Figure 4: Exploration of sample-level effects through PCA 1. 
Detection of atypical samples (manual PCA-based sample filtering): PCx vs PCy plots can expose outlier samples that appear segregated from the rest (purple-squared sample) or samples of a particular group (Sex: F or M) closer to samples from the other group (blue-squared sample). These should be further examined to evaluate if they can be kept or must be discarded. In this case, after removing them, PC2, which explains a higher % of variance in gene expression, separates samples by sex. 2. Identification of drivers of sample gene expression variation: reducing the dimensionality of our data enables us to recognize sample variables explaining differences in the gene expression of the samples (Age), ascertain technical variables and batch effects are not impacting on the transcriptome (plate), and inquire to what extent our variable of interest is contributing to changes in gene expression (Group). 4.1.2.3 Model building: covariate selection for limma-voom DGE methods fitting linear models to gene expression data to assess if a covariate impacts significantly on the expression of a gene, require the selection of sample-level variables to model transcriptomic data. If very few variables are present, normally they are all included in the model but that’s not often the case with RNA-seq and it doesn’t represent a well-founded strategy. Usually, multiple technical and biological variables are implicated in the experiments and sample QC metrics can affect the gene expression levels, even after count normalization, whereas other variables are redundant and/or minimally informative. Therefore, we’d like to identify an optimal set of variables to adjust gene expression for, in addition to the covariate of interest. We have already introduced one first approximation to that with PCA as this analysis allows us to identify variables explaining high percentages of gene expression variance between samples. 
In Chapter 7 we will review how correlation and variance partition analyses at the gene level can help us determine a suitable set of highly explanatory variables. 4.2 Differential Gene Expression Different mathematical and statistical approaches exist to compare gene expression between two or more conditions. In Chapter 5 we’ll briefly introduce methods based on the negative binomial distribution and address how to perform DGE under the empirical Bayes limma-voom framework, distinguishing how it operates, its main specifications, inputs, and outputs. 4.3 Downstream analyses After finding DEGs, volcano plots and heat maps are commonly used to graphically represent them, plotting relevant information about them and their expression levels, respectively. In Chapter 5 we’ll also check how to create and interpret these plots. References Smyth, G. K., Ritchie, M., Thorne, N., Wettenhall, J., Shi, W., & Hu, Y. (2002). limma: linear models for microarray and RNA-Seq data user’s guide. Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia. Chen, Y., Lun, A. T., & Smyth, G. K. (2016). From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline. F1000Research, 5. "],["differential-gene-expression-analysis-with-limma-voom.html", "5 Differential Gene Expression analysis with limma-voom 5.1 NB-based DGE methods? 5.2 limma-voom pipeline 5.3 DE visualization References", " 5 Differential Gene Expression analysis with limma-voom Instructor: Daianna González Padilla In this chapter you’ll learn how DGE analysis is performed under the empirical Bayes framework of the popular limma-voom pipeline, highlighting key assumptions and concepts, and main differences with other methodologies. 5.1 NB-based DGE methods? An initial central point of discussion around DGE method development is how to model the distribution of the reads. 
Many methods model the read counts (\\(y_{k,ij}\\), non-negative integers) of a gene \\(i\\) in the \\(j\\) samples of condition \\(k\\) through the Poisson or the Negative Binomial (NB) distribution. Of these, NB is often preferred as it allows the mean (\\(\\mu\\)) and the variance (\\(\\sigma^2\\)) of the reads to be different, compared to the Poisson distribution where \\(\\mu=\\sigma^2\\). This is of particular importance as controlling the variance allows us to account for variability in the gene expression levels across biological samples [1]. Figure 1: NB-distributed read counts. Modeling of read counts for gene \\(i\\) in the samples of the first and second conditions based on the NB model. Modified from Li, W. V., & Li, J. J. (2018). Estimating the NB distribution parameters is necessary to assess DE of each gene \\(i\\) between any two conditions \\(k=1,2\\) (Figure 1). Bayesian models are used, defining prior distributions and relationships of such parameters. Briefly, after 1) estimating gene-wise NB parameters, 2) the mean-variance relationship across all genes can be used to shrink the gene variance estimations borrowing information from all genes or incorporating prior knowledge, something advantageous when sample sizes are small. 3) A statistical test is used to assess for each gene \\(i\\) if its true expression in the first and second condition (\\(\\theta_{1i}\\) and \\(\\theta_{2i}\\)) is the same (null hypothesis) or differs (alternative hypothesis): \\(H_0: \\theta_{1i}=\\theta_{2i}\\) \\(H_1: \\theta_{1i}≠\\theta_{2i}\\), where the \\(\\theta_{i}\\)’s are parameters included in the mean of the NB distributions (\\(\\mu\\)). 4) The test statistic is computed for each gene and 5) its associated p-value is calculated based on the null distribution. 6) Finally, p-values are corrected for multiple-testing and DEGs are determined based on an adjusted p-value cutoff [1]. Examples of popular methods based on the NB distribution are edgeR and DESeq2. 
Nevertheless, one limitation NB-based methods face is that they set dispersion of the data as a known and global parameter, ignoring observation-specific variation and importantly, there’s a reduced number of statistical methods for count distributions compared to the normal distribution [1,2]. Here, we’ll focus on limma that does not rely on a certain distribution but rather works on \\(log_2(cpm)\\) (CPM: counts per million) and fits linear models for DGE enabling the incorporation of additional predictors to model gene expression, a feature specially valuable for complex experimental settings. 5.2 limma-voom pipeline limma is a package for the analysis of gene expression data arising from microarray or RNA-seq technologies. It has features that make the analyses stable even for experiments with small number of arrays or samples —this is achieved by borrowing information across genes. It is specially designed for analyzing complex experiments with a variety of experimental conditions and predictors [3]. Usually, limma DGE analysis is carried out in five main steps, the last four of them completed by limma R functions, as described below. We’ll use bulk RNA-seq data from the smokingMouse package to exemplify these steps. ## Load the container package for RSE library("SummarizedExperiment") ## Connect to ExperimentHub library("ExperimentHub") eh <- ExperimentHub::ExperimentHub() ## Load package datasets myfiles <- query(eh, "smokingMouse") ## Download the mouse gene data rse_gene <- myfiles[["EH8313"]] ## Samples from the nicotine experiment and from pups only rse_gene_nic <- rse_gene[, which(rse_gene$Expt == "Nicotine" & rse_gene$Age == "Pup")] ## Retain only expressed genes (passed the filtering step) rse_gene_filt <- rse_gene_nic[ rowData(rse_gene_nic)$retained_after_feature_filtering, ] Let’s explore a little the data. 
## Data dimensions: number of genes and samples dim(rse_gene_filt) #> [1] 19974 42 ## Raw counts for first 3 genes in the first 5 samples assays(rse_gene_filt)$counts[1:3, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 2652 2107 1978 2691 1833 #> ENSMUSG00000102331.1 15 15 9 15 13 #> ENSMUSG00000025900.13 10 7 28 11 8 ## Log-normalized counts for first 3 genes in the first 5 samples assays(rse_gene_filt)$logcounts[1:3, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 5.639967 5.953457 5.4923034 5.903313 5.800879 #> ENSMUSG00000102331.1 -1.747878 -1.130265 -2.1809593 -1.517393 -1.282590 #> ENSMUSG00000025900.13 -2.295096 -2.173926 -0.6153596 -1.941338 -1.948814 ## Data for the first 2 samples head(colData(rse_gene_filt), 2) #> DataFrame with 2 rows and 71 columns #> SAMPLE_ID FQCbasicStats perBaseQual perTileQual perSeqQual perBaseContent GCcontent Ncontent #> SeqLengthDist SeqDuplication OverrepSeqs AdapterContent KmerContent SeqLength_R1 percentGC_R1 phred15-19_R1 #> phred65-69_R1 phred115-119_R1 phred150-151_R1 phredGT30_R1 phredGT35_R1 Adapter65-69_R1 Adapter100-104_R1 #> Adapter140_R1 SeqLength_R2 percentGC_R2 phred15-19_R2 phred65-69_R2 phred115-119_R2 phred150-151_R2 phredGT30_R2 #> phredGT35_R2 Adapter65-69_R2 Adapter100-104_R2 Adapter140_R2 ERCCsumLogErr bamFile trimmed numReads #> numMapped numUnmapped overallMapRate concordMapRate totalMapped mitoMapped mitoRate totalAssignedGene rRNA_rate #> Tissue Age Sex Expt Group Pregnant plate location concentration #> medium date Pregnancy flowcell sum detected subsets_Mito_sum subsets_Mito_detected #> subsets_Mito_percent subsets_Ribo_sum subsets_Ribo_detected subsets_Ribo_percent retained_after_QC_sample_filtering #> retained_after_manual_sample_filtering #> [ reached getOption("max.print") -- omitted 3 rows ] 📝 Exercise 1: in order for you to perform a DGE analysis, locate your own RNA-seq datasets if you have any, or download expression data from a study of your interest and build a RSE object 
using recount3 (see Chapter 3: recount3 introduction). A third option you have is to download gene expression data from the smokingMouse package used here. A fourth option is to download data from GEO as Sean Davis will explain next. We’ll have more time tomorrow for doing this exercise with data of your choosing. 5.2.1 model.matrix() limma fits a linear model to the expression data of each gene (response variable), modeling the systematic part of the data by sample-level covariates (predictors). p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.info { background-color: #FFFFF0; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.conclusion { background-color: #EEE9E9; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.question{ background-color: #E3E3E3; padding: 20px; border: 1px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } p.comment { background-color: #F0F0F0; padding: 20px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.alert { background-color: #FFE4E1; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } p.success { background-color: #E0EEE0; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 💡 A model is a specification of how a set of variables relate to each other. In the case of a linear model, it is a linear equation that describes how the dependent or response variable is explained by the independent variables, also called predictors. 
A regression analysis with more than one independent variable is called multiple regression. Regression with only one independent variable is called simple regression [4]. The limma model is specified with a design matrix, also known as model matrix or regressor matrix, often denoted by \\(X\\). This is a matrix of values for explanatory variables of the samples: rows correspond to samples and columns to sample variables. Say that the values the \\(i\\)th sample take in the \\(h\\) covariates are \\(X_{ih}\\)’s and their coefficients are \\(\\beta_{h}\\)’s. The predicted expression of a gene in the \\(i\\)th sample is given by \\(\\hat y_i =\\beta_0 + \\sum_{1}^h\\beta_{h}X_{ih}\\). \\[ \\hat y = X\\beta=\\displaystyle {\\begin{bmatrix} \\hat y_{1}\\\\ \\hat y _{2}\\\\ \\hat y_{3}\\\\...\\\\ \\hat y_{n-1}\\\\ \\hat y_{n}\\end{bmatrix}}={\\begin{bmatrix}1&X_{11}&X_{12}&X_{13}&\\cdots&X_{1,h-1}&X_{1h}\\\\1&X_{21}&X_{22}&X_{23}&\\cdots&X_{2,h-1}&X_{2h}\\\\1&X_{31}&X_{32}&X_{33}&\\cdots&X_{3,h-1}&X_{3h} \\\\ \\vdots & \\vdots & \\vdots & \\vdots & \\ddots & \\vdots & \\vdots \\\\1&X_{n-1,1}&X_{n-1,2}&X_{n-1,3}&\\cdots&X_{n-1,h-1}&X_{n-1,h} \\\\1&X_{n,1}&X_{n,2}&X_{n,3}&\\cdots&X_{n,h-1}&X_{n,h} \\end{bmatrix}}{\\begin{bmatrix}\\beta _{0}\\\\\\beta _{1}\\\\\\beta _{2}\\\\\\beta_{3}\\\\...\\\\\\beta_{h-1}\\\\\\beta_{h}\\end{bmatrix}} \\] where \\(n\\) is the number of samples. In the first step we create this matrix using model.matrix() that receives a formula with the variables to include in the models and the sample data. 
## Define formula formula <- ~ Group + Sex + flowcell + mitoRate + overallMapRate + totalAssignedGene + detected + ERCCsumLogErr ## Model matrix model <- model.matrix(formula, data = colData(rse_gene_filt)) head(model) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX mitoRate overallMapRate #> 1 1 0 0 0 1 0 0.03876995 0.9811 #> 2 1 1 0 0 1 0 0.03337699 0.9791 #> 3 1 0 1 0 1 0 0.03606147 0.9825 #> 4 1 1 1 1 0 0 0.03962591 0.9855 #> totalAssignedGene detected ERCCsumLogErr #> 1 0.7715862 26545 -67.33211 #> 2 0.7778447 24545 -66.38868 #> 3 0.7870034 25640 -58.89350 #> 4 0.7786461 25905 -84.91929 #> [ reached getOption("max.print") -- omitted 2 rows ] ❓ Which variables to include as covariates in the models? A straightforward strategy is to keep the model as simple as possible and after fitting the model evaluate the comparisons of interest [3]. In Chapter 7 we will discuss how correlation and variance partition analyses can help us to set up the best models. ⚠️ Very important: always check which condition group is set as the reference in your model for the coefficient/contrast of interest (column named as [Coefficient_name][Reference_Group]; corresponding reference group set to 1) as this determines if a DEG is up or downregulated in the given condition compared to the other. ## Comparison of interest: Group coef <- "GroupExperimental" 📝 Exercise 2: identify the sample data of your study and create the respective design matrix. Which is the reference group for your main variable of interest? Tomorrow we will learn how to use ExploreModelMatrix for helping us interpret coefficients. 5.2.2 voom() Compared to NB-based methods, limma works with \\(log2(cpm)\\) which are approximately normally distributed (as we have seen) and thus, opens the possibility to leverage a wide range of normal-based statistical tools not available for count distributions, including methods developed for microarray data. 
However, limma doesn’t assume nor require data to follow a normal distribution, but it does apply normal-based microarray-like statistical methods to RNA-seq read counts [2]. “… limma does not make any assumption that the data appears normal in a histogram.” - Gordon Smyth, author of limma, in the Bioconductor support website 2021. The benefit of using \\(log2(cpm)\\), however, is not immediate. One limitation for the direct application of normal-based methods to log-counts is that reads counts have unequal variabilities even after a log-transformation depending on the count sizes: probability distributions for counts are naturally heteroscedastic, with log-cpm not having constant variances (larger variances for larger counts) [2]. It has been proposed that to design powerful statistical analysis for RNA-seq, it is more important to model the relationship between the mean and the variance in the data than to specify which probabilistic distribution to use for the counts [2]. And importantly, converting count data taking such relationship into account does open up access to their analysis with normal-based methods. That’s why we use voom(). What voom() does is: First, to compute log-cpm. Log-normalized expression for gene \\(g\\) in sample \\(i\\) (\\(y_{gi}\\)) is given by \\[ y_{gi}=log_2(\\frac{r_{gi} + 0.5}{R_i + 1.0} \\times 10^6) \\] where \\(r_{gi}\\) is the raw count for the gene in the sample and \\(R_i\\) the library size of the sample. We add +0.5 to the counts to avoid log of zero and +1 to the library size to ensure that \\(\\frac{r_{gi}+0.5}{R_i+1}\\) is strictly less than 1 (if \\(r_{gi} = R_i\\)). A linear model is fitted to gene log-cpm values by ordinary least squares as: \\[ E(y_{gi})=\\mu_{gi}=X_i\\beta_g \\] where \\(E(y_{gi})\\) is the expected expression of gene \\(g\\) in sample \\(i\\), \\(X_i\\) is the vector with the sample values for the covariates and \\(\\beta_g\\) the vector of covariate coefficients for the gene. 
As a result, we have the estimated \\(\\hat\\beta_g\\), the fitted log-cpm’s \\(\\hat\\mu_{gi}=X_i\\hat\\beta_g\\) and the residual standard deviations \\(s_g\\). Then it estimates the mean-variance trend of the data by fitting a smooth curve to the \\(\\sqrt s_g\\) of the genes presented as a function of the average gene expression (in log-counts, not log-cpm). The \\(\\sqrt s_g\\)’s are used because they are symmetrically distributed. Log-counts typically show a decreasing mean-variance trend. voom() then predicts the standard deviation of each individual normalized observation \\(y_{gi}\\) (limma-trend does that at the gene level) using this trend curve: the fitted log-count of each observation is mapped to the curve and its \\(\\sqrt s_{gi}\\) value is obtained. The observation weights are \\(w_{gi}=\\frac{1}{s_{gi}^2}\\). Figure 2: voom() procedure to estimate observation-level variance weights for limma. Extracted from the original voom publication ( Law, C. W. et al. 2018). Log-cpm (\\(y_{gi}\\)) and associated weights (\\(w_{gi}\\)) can then be entered into the limma framework for linear modeling. These weights are used in the linear modeling to adjust for count heteroscedasticity [2]. library("limma") ## voom(): # 1. Transform counts to log2(cpm) # ---------------------------------------------------------------------------- # . | Note we passed voom() raw counts as input, not the lognorm counts!!! | # ---------------------------------------------------------------------------- # 2. Estimate mean-variance relationship for each gene # 3. Compute observation weights for limma (next step) vGene <- voom(assay(rse_gene_filt), design = model, plot = TRUE) Let’s explore the outputs of this function. 
## Returned data names(vGene) #> [1] "E" "weights" "design" "targets" ## E: contains the computed log(cpm) dim(vGene$E) #> [1] 19974 42 vGene$E[1:5, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> ENSMUSG00000051951.5 5.906572 6.1425731 5.7434780 6.133741 6.061250 #> ENSMUSG00000102331.1 -1.512368 -0.9445475 -1.9587859 -1.306258 -1.024247 #> ENSMUSG00000025900.13 -2.074247 -1.9918532 -0.3738234 -1.736892 -1.691672 #> ENSMUSG00000025902.13 1.446325 1.2611275 1.3707154 1.419026 1.688471 #> ENSMUSG00000098104.1 1.572354 1.2408075 1.4727667 1.404882 1.533748 ## weights: contains the computed variance weight for each observation dim(vGene$weights) #> [1] 19974 42 vGene$weights[1:5, 1:5] #> [,1] [,2] [,3] [,4] [,5] #> [1,] 143.326885 117.323375 139.214140 141.247546 128.818305 #> [2,] 4.255525 4.277395 2.698902 5.113520 3.377285 #> [3,] 4.009671 3.341317 5.555186 4.020098 2.546810 #> [4,] 20.584769 15.108579 15.521441 19.219652 16.893714 #> [5,] 22.473314 16.369739 18.359068 17.691839 14.325510 ## design: is the provided design matrix head(vGene$design) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX mitoRate overallMapRate #> 1 1 0 0 0 1 0 0.03876995 0.9811 #> 2 1 1 0 0 1 0 0.03337699 0.9791 #> 3 1 0 1 0 1 0 0.03606147 0.9825 #> 4 1 1 1 1 0 0 0.03962591 0.9855 #> totalAssignedGene detected ERCCsumLogErr #> 1 0.7715862 26545 -67.33211 #> 2 0.7778447 24545 -66.38868 #> 3 0.7870034 25640 -58.89350 #> 4 0.7786461 25905 -84.91929 #> [ reached getOption("max.print") -- omitted 2 rows ] ## targets: the sample library sizes used to compute log(cpm) in the first step dim(vGene$targets) #> [1] 42 1 head(vGene$targets) #> lib.size #> 1 44218086 #> 2 29831069 #> 3 36929795 #> 4 38331383 #> 5 27457620 #> 6 27113922 identical(vGene$targets$lib.size, colSums(assay(rse_gene_filt))) #> [1] TRUE ➡️ In summary, voom() estimates non-parametrically the global mean-variance trend of the count data based on the expression of the genes and uses that to predict 
the variance of each individual expression observation (each log-cpm value) based on their predicted count sizes. The predicted variances are then associated as inverse weights to each observation that when used in linear modeling eliminate the log-cpm mean-variance trend [2]. 👉🏼 Advantages: ✅ voom() estimates the mean-variance relationship in a non-parametric way. “The parametric advantages of the Poisson or NB distributions are mitigated by the fact that the observed mean-variance relationship of RNA-seq data does not perfectly match the theoretical mean-variance relationships inherent in these distributions. While the quadratic mean-variance relationship of the NB distribution captures most of the mean-variance trend, the NB dispersion still shows a non-ignorable trend with gene abundance.” [2] ✅ Since voom() is a method to adapt count data to normal models, these give access to tractable empirical Bayes distribution theory. ✅ The use of normal distribution approaches and variance modeling is supported by generalized linear model theory. 📝 Exercise 3: compute the \\(log2(cpm)\\) and the residual variance weights for each observation in your data using voom(). 5.2.3 lmFit() This limma function fits a multiple linear model to the expression of each gene by weighted or generalized least squares to estimate the coefficients of the sample covariates which correspond to the logFC’s comparing gene expression between sample groups. Ordinary least squares (OLS) This is used to estimate the coefficients of a linear regression by minimizing the residual sum of squares [5]. Figure 3: Graphical representation of the OLS method for simple regression analysis. Source: Gulve, A. (2020). Ordinary Least Square (OLS) Method for Linear Regression. For simplicity, let’s work with one gene and say we have \\(n\\) samples. 
The fitted expression of the gene in the \\(j\\)th sample is \\(\\hat y_j =\\beta_{0} + \\sum_{1}^h\\beta_{h}X_{jh}\\) , where \\(\\beta_h\\) is the coefficient for the \\(h\\)th covariate and \\(X_{jh}\\) the value the \\(j\\)th sample takes for the \\(h\\)th covariate. It can also be written as \\(\\hat y_j =\\sum_{0}^h\\beta_{h}X_{jh}\\) if \\(X_{j0}=1\\). So we have an overdetermined system of \\(n\\) linear equations and \\(h\\) unknown parameters with \\(n>h\\): \\(\\hat y_j =\\sum_{0}^h\\beta_{h}X_{jh}\\) with \\(j=(1,2, ..., n)\\). Such system usually has no exact solution, so we need to estimate the coefficients that better fit the data in a linear regression. The problem is reduced to solving a quadratic minimization problem: \\(\\hat \\beta=arg \\ _\\beta\\ min \\ \\ S(\\beta)\\) where \\(S(\\beta)=\\sum_j(y_j -\\hat y_j)^2=RSS\\) (residual sum of squares). 💡 We can think of these \\(\\beta\\)’s as differences in the fitted (expected) expression of a gene. Say we have two binary categorical variables in the model (\\(X_1\\) and \\(X_2\\)), then the expected gene expression in a sample is \\(E(y|X_1, X_2) =\\hat y =\\beta_{0} + \\beta_1X_1+\\beta_2X_2\\), where \\(X_1\\) and \\(X_2\\) equal to 1 or 0. Then we have the following 4 combinations: \\(E(y|X_1=1, X_2=1) = \\mu_{12}=\\beta_{0} + \\beta_1+\\beta_2\\) \\(E(y|X_1=1, X_2=0) =\\mu_{1}=\\beta_{0} + \\beta_1\\) \\(E(y|X_1=0, X_2=1) =\\mu_{2}=\\beta_{0} + \\beta_2\\) \\(E(y|X_1=0, X_2=0) =\\mu_{0}=\\beta_{0}\\) So \\(\\beta_1=\\) \\(\\mu_1-\\mu_0\\) \\(=\\) \\(\\mu_{12}-\\mu_2\\) and \\(\\beta_2=\\) \\(\\mu_2\\)\\(-\\) \\(\\mu_0\\). Say our variable of interest is \\(\\beta_1\\). Then what we are testing is if the expected expression of a gene is different when \\(X_1=1\\) (in the first sample group) and \\(X_1=0\\) (in the second sample group), fixing \\(X_2\\) in either 1 or 0. 
Generalized least squares (GLS) Is a generalization of OLS that allows for heteroskedasticity and correlation between the residuals [6]. Weighted least squares (WLS) In this case the function to be minimized becomes the weighted sum of the squared residuals: squared residuals are weighted by the reciprocal of their variance so that more noisy observations have less weight. That’s what we used voom() for. lmFit() returns a fitted model object with the estimated coefficients, standard errors (\\(SE=sd/\\sqrt n\\)) and residual standard errors/deviations (\\(RSE=s_g=\\sqrt {RSS/ n-2}\\)) for each gene. Depending on the arguments and correlations in the data, this function calls one of the following functions to fit a linear model for each gene [7]: mrlm: for a robust regression if method=\"robust”. gls.series: GLS estimator if method=\"ls” and a correlation structure has been specified. lm.series: OLS method if method=\"ls” and there is no correlation structure. For the weights argument of lmFit(), the precision weights for the observations previously computed are extracted from the voom() output. ## lmFit(): # 1. 
Fit linear model for each gene to estimate logFCs fitGene <- lmFit(vGene) ## Corroborate "ls" method was applied fitGene$method #> [1] "ls" ## Explore outputs: estimated coefficients (logFCs) head(fitGene$coefficients) #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX flowcellHKCTMDSXX #> ENSMUSG00000051951.5 -35.637900 -0.05125195 0.05690091 -0.47469588 -0.38545404 -0.66545820 #> ENSMUSG00000102331.1 37.943310 0.72450620 0.19887963 -0.20803712 -0.40926270 -0.10900553 #> ENSMUSG00000025900.13 -43.586603 0.17256694 0.28895786 -0.04476551 0.15257245 -0.06949759 #> ENSMUSG00000025902.13 5.657837 -0.05025788 -0.04808144 -0.18732331 -0.26041436 0.07364071 #> mitoRate overallMapRate totalAssignedGene detected ERCCsumLogErr #> ENSMUSG00000051951.5 -11.487040 37.443647 4.753830 6.442499e-05 -0.0043851842 #> ENSMUSG00000102331.1 -21.871815 27.323603 -79.223111 -1.026687e-04 0.0166190950 #> ENSMUSG00000025900.13 16.727251 7.179484 42.367908 8.535604e-05 0.0133260060 #> ENSMUSG00000025902.13 -24.538883 11.174017 -18.548282 -1.231546e-05 -0.0092194951 #> [ reached getOption("max.print") -- omitted 2 rows ] 💡 Interaction terms in linear models There may be cases where we want to assess gene expression differences between 2 conditions within more than one specific group; for example if we were interested in knowing what are the effects of a treatment (\\(X_1=1\\) for treatment and 0 for controls) in females and males separately (\\(X_2=1\\) for females and 0 for males). 
In such cases we can fit an interaction model in which we include the product of \\(X_1\\) and \\(X_2\\) so that \\(X_1X_2=1\\) if a sample comes from a female that was treated and 0 otherwise: \\[E(y|X_1, X_2) =\\beta_{0} + \\beta_1X_1+\\beta_2X_2 + \\beta_3X_1X_2\\] \\(E(y|X_1=1, X_2=1) =\\mu_{12} =\\beta_{0} + \\beta_1+\\beta_2+\\beta_3\\) \\(E(y|X_1=1, X_2=0) =\\mu_{1} =\\beta_{0} + \\beta_1\\) \\(E(y|X_1=0, X_2=1) =\\mu_{2} =\\beta_{0} + \\beta_2\\) \\(E(y|X_1=0, X_2=0) =\\mu_{0} =\\beta_{0}\\) \\(\\beta_1 + \\beta_3=\\) \\(\\mu_{12}-\\mu_2\\) which is the expression difference between treated and control female samples (\\(X_2=1\\)) and \\(\\beta_1 =\\) \\(\\mu_{1}-\\mu_0\\) for male samples (\\(X_2=0\\)). Finally \\(\\beta_3\\), called the interaction term, is (\\(\\mu_{12}-\\mu_2\\))\\(-\\)(\\(\\mu_1-\\mu_0\\)), described as the difference in gene expression changes driven by the treatment in females compared to males [8]. 📝 Exercise 4: fit a linear regression model to the expression data of your genes and extract the coefficients for the included covariates. 5.2.4 eBayes() Next, we want to assess if the differences in gene expression between the sample groups are statistically significant. Initially, we can think of comparing the mean expression of a gene in the sample groups (e.g. cases and controls) which can be handled applying a two-sample t-test assuming that the values in both groups have an approximately normal distribution. Here we use the t-score (t-stats) to define if the difference in the means is statistically significant based on a t-distribution. 
The t-stats is given by: \\[ t=\\frac{\\bar x_1 - \\bar x_2}{\\sqrt{\\frac{s_1^2}{n_1}+\\frac{s_2^2}{n_2}}} \\] where \\(\\bar x_1\\) and \\(\\bar x_2\\) are the means of the expression values of a gene in the first and second sample groups, \\(s_1\\) and \\(s_2\\) are the sample standard deviations of gene expression in the same groups, and \\(n_1\\), \\(n_2\\) the corresponding sample group sizes: \\(s_1 = \\sqrt{\\frac{\\sum_{i=1}^ {n_1} (x_i-\\bar x_1)^2}{n_1-1}}\\) and \\(s_2 = \\sqrt{\\frac{\\sum_{j=1}^ {n_2} (x_j-\\bar x_2)^2}{n_2-1}}\\), with \\(x_i\\) and \\(x_j\\) the gene expression values in the samples of group 1 and 2, respectively. ➡️ Note that we say sample means and sample standard deviations because they are estimators of the population parameters, computed based on the data that we have. We can think of this t-stats as a ratio of signal and noise. The numerator contains the difference between the two means, taken as the signal for DE. The denominator corresponds to the standard error and represents the noise in terms of gene expression variance within the sample groups. This represents how spread out the signal is [9]. In that way, the t-stats is a measure of how strong is the DE signal. Once computed, the t-stats have an associated p-value based on a Student t-distribution under the null hypothesis (\\(H_o\\): \\(\\bar x_1 - \\bar x_2=0\\)). This is exactly what we can get using lm(): ## Lognorm expression of first gene rse_gene_one_gene <- rse_gene_filt[1, ] colData(rse_gene_one_gene) <- cbind(colData(rse_gene_one_gene), "lognorm_expr" = assays(rse_gene_one_gene)$logcounts[1, ] ) ## Fit simple linear model formula <- lognorm_expr ~ Group lm <- lm(formula, data = colData(rse_gene_one_gene)) summary(lm) #> #> Call: #> lm(formula = formula, data = colData(rse_gene_one_gene)) #> #> Residuals: #> Min 1Q Median 3Q Max #> -1.05368 -0.06304 0.03012 0.10254 0.24844 #> #> Coefficients: #> Estimate Std. 
Error t value Pr(>|t|) #> (Intercept) 5.75377 0.04502 127.800 <2e-16 *** #> GroupExperimental -0.04292 0.06694 -0.641 0.525 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 0.2159 on 40 degrees of freedom #> Multiple R-squared: 0.01017, Adjusted R-squared: -0.01457 #> F-statistic: 0.4111 on 1 and 40 DF, p-value: 0.525 ## Two sample t-test t.test(formula, data = colData(rse_gene_one_gene), var.equal = TRUE) #> #> Two Sample t-test #> #> data: lognorm_expr by Group #> t = 0.64121, df = 40, p-value = 0.525 #> alternative hypothesis: true difference in means between group Control and group Experimental is not equal to 0 #> 95 percent confidence interval: #> -0.09236465 0.17820636 #> sample estimates: #> mean in group Control mean in group Experimental #> 5.753765 5.710845 💡 Sample sizes are critical! Larger sample sizes increase the power of the tests and reduce the false discovery rate (FDR) as they decrease the denominator of the t-stats (increasing their values) and slight differences can then be detected. ⚠️ Now consider that for genes with small variances in their expression the t-stats will be greater and we could be detecting non-DEGs as DE (false positives). But two things must be considered at least when working with gene expression data: The first is that expression values are usually not normally distributed. Second, the distributions and variances of expression values vary across genes and conditions. ➡️ With that in mind, inference at the individual gene level can be addressed borrowing information from all the genes in the experiment through a Bayes or empirical Bayes method that produces more powerful tests. The idea of Bayesian statistics is to give unknown quantities a prior distribution, considering each feature as a member of a population of features such as genes. 
More specifically, empirical Bayes methods are procedures for statistical inference in which the (empirical) prior distribution is estimated from the population of all features (from the data) [8]; in standard Bayesian methods this prior distribution is fixed before observing any data [10]. Inspired by the work of Lönnstedt and Speed (2002) in which a simple expression for the posterior odds of differential expression for each gene was computed using a parametric empirical Bayes approach, Smyth, G. K. (2004) generalized this model for its application to experiments with any numbers of samples and conditions and reformulated the posterior odds statistic in terms of a moderated t-statistic in which the posterior residual standard deviations are used instead of the ordinary ones, eliminating the requirement of knowing the non-null prior guess for the proportion of differentially expressed genes required in the log-odds [11]. Let’s see how it proceeds. First, for each gene \\(g\\) we have a vector with the expression values in the \\(n\\) samples: \\[ y_{g}= (y_{g1}, y_{g2}, ..., y_{gn}) \\] We already know that the expected (predicted) gene expression in the samples is \\(E(y_{g})=X\\alpha_g\\) with \\(X\\) the design matrix and \\(\\alpha_g\\) the vector of the coefficients for the \\(h\\) covariates in the model; of these the ones of biological interest are the \\(\\beta_g\\)’s (contrasts of interest). Then, as previously described, a linear model is fitted to the expression data for each gene to obtain the coefficient estimators (\\(\\hat \\alpha_g\\)) (as well as \\(\\hat \\beta_g\\)), the residual sample variances (\\(s_g^2\\)) as estimators of the (true but unknown) residual variances (\\(\\sigma_g^2\\)), and the estimated covariance matrices. Two relevant considerations here: The expression values are not necessarily assumed to be normally distributed. The linear model is not assumed to be necessarily by least squares. 
However, there are two assumptions: The contrast estimators \\(\\hat \\beta_g\\) are assumed to be approximately normally distributed with mean \\(\\beta_g\\). The residual sample variances (\\(s_g^2\\)) are assumed to follow approximately a scaled chisquare distribution. Under such assumptions the ordinary t-stats for the covariate \\(j\\) in the gene \\(g\\) is defined by: \\[ t_{gj}=\\frac{\\hat \\beta_{gj}}{s_g u_{gj}}=\\frac{\\hat \\beta_{gj}}{SE(\\hat \\beta_{gj})} \\] with \\(s_g\\) the residual sample standard deviation of the gene and \\(u_{gj}\\) the unscaled standard deviation. \\(SE\\) stands for standard error. The key step in the empirical Bayes approach of limma is to leverage the information across all genes by defining prior distributions for the unknown coefficients \\(\\beta_{gj}\\) and residual variance \\(\\sigma_g^2\\) of the genes. For \\(\\beta_{gj}\\) it models the prior distribution of the coefficients that are not zero, i.e. the expected distribution of the logFC of the genes that are DE, which is given by: \\[\\beta_{gj}|\\sigma_{g}^2, \\beta_{gj}≠0 \\sim N(0, v_{0j}\\sigma_g^2)\\] “Saying that the betas have prior information centered around zero implies that we are ignorant of the sign (+/-) of the beta.” – Vincent Carey (personal communication) For the residual variances what limma does is to take the residual sample variances of all genes (\\(s_g^2\\)’s) and estimate the empirical parameters of the gamma distribution it is assumed that they follow. 
Specifically, \\(\\frac{1}{\\sigma_g^2}\\) is modeled by a scaled chisquare (gamma) prior distribution with mean \\(\\frac{1}{s_0^2}\\) and \\(d_0\\) degrees of freedom, describing how the residual variances are expected to vary across genes: \\[\\frac{1}{\\sigma_g^2} \\sim \\frac{1}{d_0s_0^2}\\chi_{d_0}^2\\] What we want to do next is not only to take the mean of the residual variances in the distribution (prior mean\\(\\frac{1}{s_0^2}\\)) but to estimate each gene residual variance as a Bayes predictor: as a weighted average of the prior mean (\\(\\frac{1}{s_0^2}\\)) and the observed sample variance (\\(s_g^2\\)) of each gene. This is called the moderated variance and what is graphically happening is that we are pulling the observed gene variances towards the prior mean variance: large variances are reduced and the |t-stats| increases (more powerful t-test for those genes) and small variances are increased, decreasing the |t-stats| and the power of the t-test. Under this model the posterior residual sample variance or posterior residual variance estimator (\\(\\tilde s_g^2\\)) is: \\[ \\tilde s_g^2=E(\\sigma_g^2|s_g^2)=\\frac{d_0s_0^2 + d_gs_g^2}{d_0+d_g} \\] Moderation is somehow like having larger sample sizes for the estimation of variance given that the moderated variances are (on average) closer to the population variance than the original sample variances. The moderated t-statistic can be now defined in terms of this posterior residual sample standard deviations instead of the usual ones: \\[ \\tilde t_{gj}=\\frac{\\hat \\beta_{gj}}{\\tilde s_g u_{gj}} \\] These moderated t-stats follow a t-distribution under the null hypothesis (\\(H_o:B_{gj}=0\\)) with degrees of freedom \\(d_g+d_0\\) and the associated p-values can be computed based on such distribution. As previously stated, with this redefined formula, large t-stats merely from very small \\(s_g\\)’s are avoided. 
This results in increased power and reduced false non-discovery rate (FNR) (non detected DEGs) and the number of DEGs obtained increases [8]. In the end we say we have moderated the residual sample standard deviations of each gene in the t-stats denominator by using the distribution of all of them across the population of genes. ✅ The approach of using the posterior values results in shrinking the gene-wise residual sample variances (\\(s_g^2\\)) towards the prior mean, making a more stable inference when a small number of samples is available. eBayes() will implement this empirical Bayes model to compute for each gene and for each contrast these moderated t-statistics and their unadjusted p-values. Additionally, it returns moderated F-statistics and log-odds of differential expression. The moderated F-statistic tests whether any of the contrasts for a gene is non-zero (\\(H_0:B_{g}=0\\)), i.e., whether that gene is differentially expressed for any contrast; it is similar to the ordinary F-statistic from analysis of variance (ANOVA). The t-test does that for each individual contrast \\(j\\) (\\(H_0:B_{gj}=0\\)). 👉🏼 Check more about F-stats and other statistics computed by eBayes() here: https://support.bioconductor.org/p/6124/. ## eBayes() ## 1. 
Compute the empirical Bayes statistics for DE eBGene <- eBayes(fitGene) ## Outputs of interest: ## s2.prior -> prior residual variance (prior mean 1/s0^2) ## in prior distribution of residual variances eBGene$s2.prior #> [1] 0.78987 ## df.prior -> degrees of freedom d0 in prior distribution ## of residual variances eBGene$df.prior #> [1] 4.913248 ## s2.post -> posterior residual sample variances of the genes (~sg^2) length(eBGene$s2.post) #> [1] 19974 head(eBGene$s2.post) #> [1] 2.3397702 0.7092520 1.1613995 0.9579389 0.7390718 0.4996251 ## t -> moderated t-stats of the genes for each contrast dim(eBGene$t) #> [1] 19974 11 eBGene$t[1:5, 1:5] #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX #> ENSMUSG00000051951.5 -4.4458336 -1.0615386 1.2467597 -4.8896590 -4.2555598 #> ENSMUSG00000102331.1 1.5930925 4.5298885 1.3144392 -0.6934431 -1.3930231 #> ENSMUSG00000025900.13 -1.2896585 0.8392518 1.4887297 -0.1047634 0.3829694 #> ENSMUSG00000025902.13 0.4035855 -0.5885950 -0.5891347 -1.0922124 -1.6131630 #> ENSMUSG00000098104.1 0.7120536 -0.3614893 -1.0031002 0.6473100 1.0165548 ## p.value: corresponding unadjusted p-values of moderated t-stats dim(eBGene$p.value) #> [1] 19974 11 eBGene$p.value[1:5, 1:5] #> (Intercept) GroupExperimental SexM flowcellHKCMHDSXX flowcellHKCNKDSXX #> ENSMUSG00000051951.5 8.086618e-05 2.955320e-01 0.2205562 2.114989e-05 0.0001425797 #> ENSMUSG00000102331.1 1.199041e-01 6.283899e-05 0.1970317 4.924924e-01 0.1721763827 #> ENSMUSG00000025900.13 2.054130e-01 4.068800e-01 0.1452882 9.171465e-01 0.7039999597 #> ENSMUSG00000025902.13 6.889106e-01 5.598170e-01 0.5594589 2.820128e-01 0.1154631942 #> ENSMUSG00000098104.1 4.810321e-01 7.198519e-01 0.3225237 5.215478e-01 0.3161651149 📝 Exercise 5: obtain the moderated t-stats and associated p-values of all genes in you data for all covariates included in your model. 
5.2.5 topTable() This function is also provided by limma and summarizes the results of the linear model, performs hypothesis tests and adjusts the p-values for multiple testing [12]. Among the summary statistics presented, it returns the log2FCs, moderated t-statistics, p-values, and FDR-adjusted p-values of the genes for a given contrast of interest. The default form of p-value adjustment is the Benjamini and Hochberg’s method to control the false discovery rate (FDR) which assumes independence between genes. Relevant concepts: q-value → is the FDR-adjusted p-value used to control the False Discovery Rate (FDR) that is the expected proportion of false discoveries among the discoveries (DEGs). Selecting discoveries as those being below \\(\\alpha\\) in q-value, we control FDR ≤ \\(\\alpha\\). Now we have the final statistics to determine which genes are DE. ## topTable() ## 1. Obtain gene-wise DE stats for Group (Nicotine vs Ctrl) top_genes <- topTable(eBGene, coef = coef, p.value = 1, number = nrow(rse_gene_filt), sort.by = "none") ## Outputs for each gene and for the coeff selected (Group): ## logFC: log2-fold-changes head(top_genes$logFC) #> [1] -0.05125195 0.72450620 0.17256694 -0.05025788 -0.02726320 -0.02684710 In limma the \\(\\beta_{gj}\\)’s are the logFC’s: setdiff(top_genes$logFC, eBGene$coefficients[, "GroupExperimental"]) #> numeric(0) ## t: moderated t-stats head(top_genes$t) #> [1] -1.0615386 4.5298885 0.8392518 -0.5885950 -0.3614893 -1.0959528 ## . P.value: unadjusted p-values of t-stats head(top_genes$P.Value) #> [1] 2.955320e-01 6.283899e-05 4.068800e-01 5.598170e-01 7.198519e-01 2.803946e-01 ## adj.P.Val: p-values adjusted to control the FDR head(top_genes$adj.P.Val) #> [1] 0.53854173 0.00412576 0.63704026 0.75340755 0.85943342 0.52324928 After running all these 5 steps, one main initial plot we have to look at is the histogram of the p-values of the moderated t-stats of the genes. 
If there were DEGs, we’d expect to see a flat distribution of p-values corresponding to non-DEGs and a peak near p=0 for DEGs (for which we reject the null hypothesis). If this peak is absent but a uniform distribution still appears, DEGs might be detected after correcting for multiple testing. ## Histogram of unadjusted p-values hist(top_genes$P.Value, xlab = "p-values", main = "") If very different p-value distributions are obtained from the uniform one, the best we can do is trying to explore if there are specific groups of genes (e.g. lowly-expressed genes) presenting such variable p-values and revisiting the assumptions and considerations of the statistical tests implemented [13]. 📝 Exercise 6: obtain the DE logFCs, t-stats, p-values, and adjusted p-values of the genes for a given constrast/covariate under study. 5.3 DE visualization DEGs are identified defining a significance threshold (on the adjusted p-values). Let’s quantify the number of DEGs for nicotine exposure in pup brain and visualize their expression and DE statistics. ## DEGs for FDR<0.05 de_genes <- top_genes[which(top_genes$adj.P.Val < 0.05), ] ## Number of DEGs dim(de_genes) #> [1] 1895 6 5.3.1 Volcano plots A very practical and useful plot to graphically represent DEGs and visualize their expression differences between conditions is a volcano plot. This is a scatter plot of the logFC’s of the genes in the x-axis vs their adjusted p-values in a -log scale in the y-axis. library("ggplot2") ## Define up- and down-regulated DEGs, and non-DEGs FDR <- 0.05 DE <- vector() for (i in 1:dim(top_genes)[1]) { if (top_genes$adj.P.Val[i] > FDR) { DE <- append(DE, "n.s.") } else { if (top_genes$logFC[i] > 0) { DE <- append(DE, "Up") } else { DE <- append(DE, "Down") } } } top_genes$DE <- DE ## Colors, sizes and transparencies for up & down DEGs and non-DEGs cols <- c("Up" = "indianred2", "Down" = "steelblue2", "n.s." = "grey") sizes <- c("Up" = 1.3, "Down" = 1.3, "n.s." 
= 0.8) alphas <- c("Up" = 0.4, "Down" = 0.6, "n.s." = 0.5) ## Plot volcano plot ggplot( data = top_genes, aes( x = logFC, y = -log10(adj.P.Val), color = DE, fill = DE, size = DE, alpha = DE ) ) + geom_point(shape = 21) + geom_hline( yintercept = -log10(FDR), linetype = "dashed", color = "gray35", linewidth = 0.5 ) + geom_vline( xintercept = c(-1, 1), linetype = "dashed", color = "gray35", linewidth = 0.5 ) + labs(y = "-log10(FDR)", x = "logFC(Nicotine vs Control)") + theme_bw() + scale_color_manual(values = cols, name = "Differential expression") + scale_fill_manual(values = cols, name = "Differential expression") + scale_size_manual(values = sizes, name = "Differential expression") + scale_alpha_manual(values = alphas, name = "Differential expression") + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), legend.key.height = unit(0.15, "cm"), axis.title = element_text(size = (13)), legend.title = element_text(size = 13), legend.text = element_text(size = 12) ) 5.3.2 Heat maps Another common way to represent differential expression results is through a heat map. The package ComplexHeatmap offers a flexible toolkit to easily create heat maps with row and column annotations, a feature of particular value to plot expression data of genes across samples with multiple biological and technical differences. Although initially all genes in your data can be plotted, frequently only DEGs are included as they tend to show clearer gene expression patterns. library("ComplexHeatmap") ## We plot lognorm counts lognorm_data <- assays(rse_gene_filt)$logcounts ## Subset to DEGs only lognorm_data <- lognorm_data[rownames(de_genes), ] ## Define column (sample) names to display colnames(lognorm_data) <- paste0("Pup_", 1:dim(lognorm_data)[2]) 🗒️ Notes: It is sometimes convenient to regress out the technical variables’ contributions on gene expression to see more clearly the effects of interest. 
This can happen, for instance, when the logFCs are too small to see any significant differences in the plots or when there are other strong confounding factors. Functions such as cleaningY() of jaffelab can be used for this purpose. The lognorm counts have to be correctly scaled and centered (around zero) to make the differences in the expression of the genes more notorious in the heat map. A simple way to do that is substracting from each lognorm count \\(y_{gi}\\) (from the gene \\(g\\) and sample \\(i\\)) the mean expression of the gene* and dividing by the standard deviation (\\(\\sigma\\)) of the same gene expression values. This is formally called the z-score: the number of standard deviations away from the mean. \\[ z=\\frac{y_{gi} - \\frac{\\sum_{k=1}^{n}{y_{gk}}}{n}}{\\sigma}, \\] \\(n\\) is the number of samples. * This can also be done by columns (samples), not only by rows (genes). 👉🏼 For more on centering and scaling, see this video: ## Center and scale the data to make differences more evident lognorm_data <- (lognorm_data - rowMeans(lognorm_data)) / rowSds(lognorm_data) ## Sample annotation: Sex, Group, and library size col_anno <- HeatmapAnnotation( df = as.data.frame(colData(rse_gene_filt)[, c("Sex", "Group")]), library_size = anno_barplot(colData(rse_gene_filt)$sum, gp = gpar(fill = "lightyellow2")), col = list( "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "Group" = c("Control" = "gray68", "Experimental" = "gold2") ) ) ## Gene annotation: logFC and biotype de_genes$logFC_binary <- sapply(de_genes$logFC, function(x) { if (x > 0) { ">0" } else { "<0" } }) de_genes$protein_coding_gene <- sapply(rowData(rse_gene_filt[rownames(de_genes), ])$gene_type, function(x) { if (x == "protein_coding") { "TRUE" } else { "FALSE" } }) gene_anno <- rowAnnotation( df = as.data.frame(cbind( "logFC" = de_genes$logFC_binary, "protein_coding_gene" = de_genes$protein_coding_gene )), col = list( "logFC" = c("<0" = "deepskyblue3", ">0" = "brown2"), "protein_coding_gene" 
= c("TRUE" = "darkseagreen3", "FALSE" = "magenta") ) ) library("circlize") ## Plot Heatmap(lognorm_data, name = "lognorm counts", show_row_names = FALSE, top_annotation = col_anno, left_annotation = gene_anno, row_km = 2, column_km = 2, col = colorRamp2(c(-4, -0.0001, 00001, 4), c("darkblue", "lightblue", "lightsalmon", "darkred")), row_title = "DEGs", column_title = "Samples", column_names_gp = gpar(fontsize = 7), heatmap_width = unit(12.5, "cm"), heatmap_height = unit(12.5, "cm") ) 📝 Exercise 7: obtain the number of DEGs you got and represent them in a volcano plot and a heat map. Include all the sample and gene information you consider relevant in the latter. References Li, W. V., & Li, J. J. (2018). Modeling and analysis of RNA‐seq data: a review from a statistical perspective. Quantitative Biology, 6(3), 195-209. Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome biology, 15(2), 1-17. Smyth, G. K., Ritchie, M., Thorne, N., Wettenhall, J., Shi, W., & Hu, Y. (2002). limma: linear models for microarray and RNA-Seq data user’s guide. Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia. van den Berg, S. M. (2022). Analysing data using linear models. Web site: https://bookdown.org/pingapang9/linear_models_bookdown/ Wikipedia. (n.d.). Ordinary least squares. Web site: https://en.wikipedia.org/wiki/Ordinary_least_squares Taboga, Marco (2021). “Generalized least squares”, Lectures on probability theory and mathematical statistics. Kindle Direct Publishing. Online appendix. https://www.statlect.com/fundamentals-of-statistics/generalized-least-squares. Documentation for lmFit: https://rdrr.io/bioc/limma/man/lmFit.html The Pennsylvania State University. (2018). Statistical Analysis of Genomics Data. Web site: https://online.stat.psu.edu/stat555/node/36/ Tushe, M. (2021). A Simple Trick to Understand the t-test. 
Web site: https://miroslavtushev.medium.com/a-simple-trick-to-understand-the-t-test-2c2a9e7f1dc5 Wikipedia. (n.d.). Empirical Bayes method. Web site: https://en.wikipedia.org/wiki/Empirical_Bayes_method#:~:text=Empirical Bayes methods are procedures,before any data are observed. Smyth, G. K. (2004). Linear models and empirical bayes methods for assessing differential expression in microarray experiments. Statistical applications in genetics and molecular biology, 3(1). Documentation for topTable: https://www.rdocumentation.org/packages/limma/versions/3.28.14/topics/toptable Robinson, D. (2014). How to interpret a p-value histogram. Web site: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/ "],["interpreting-model-coefficients-with-exploremodelmatrix.html", "6 Interpreting model coefficients with ExploreModelMatrix 6.1 Model objects in R 6.2 ExploreModelMatrix 6.3 Example 1 6.4 Example 2 6.5 Example 3 6.6 Exercise 6.7 To learn more 6.8 Community", " 6 Interpreting model coefficients with ExploreModelMatrix Instructor: Leo 6.1 Model objects in R Linear regression review https://lcolladotor.github.io/bioc_team_ds/helping-others.html#linear-regression-example With R, we use the model.matrix() to build regression models using the Y ~ X1 + X2 formula syntax as exemplified below. 
## ?model.matrix mat <- with(trees, model.matrix(log(Volume) ~ log(Height) + log(Girth))) mat #> (Intercept) log(Height) log(Girth) #> 1 1 4.248495 2.116256 #> 2 1 4.174387 2.151762 #> 3 1 4.143135 2.174752 #> 4 1 4.276666 2.351375 #> 5 1 4.394449 2.370244 #> 6 1 4.418841 2.379546 #> 7 1 4.189655 2.397895 #> 8 1 4.317488 2.397895 #> 9 1 4.382027 2.406945 #> 10 1 4.317488 2.415914 #> 11 1 4.369448 2.424803 #> 12 1 4.330733 2.433613 #> 13 1 4.330733 2.433613 #> 14 1 4.234107 2.459589 #> 15 1 4.317488 2.484907 #> 16 1 4.304065 2.557227 #> [ reached getOption("max.print") -- omitted 15 rows ] #> attr(,"assign") #> [1] 0 1 2 colnames(mat) #> [1] "(Intercept)" "log(Height)" "log(Girth)" How do we interpret the columns of our model matrix mat? summary(lm(log(Volume) ~ log(Height) + log(Girth), data = trees)) #> #> Call: #> lm(formula = log(Volume) ~ log(Height) + log(Girth), data = trees) #> #> Residuals: #> Min 1Q Median 3Q Max #> -0.168561 -0.048488 0.002431 0.063637 0.129223 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) -6.63162 0.79979 -8.292 5.06e-09 *** #> log(Height) 1.11712 0.20444 5.464 7.81e-06 *** #> log(Girth) 1.98265 0.07501 26.432 < 2e-16 *** #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 0.08139 on 28 degrees of freedom #> Multiple R-squared: 0.9777, Adjusted R-squared: 0.9761 #> F-statistic: 613.2 on 2 and 28 DF, p-value: < 2.2e-16 6.2 ExploreModelMatrix It’s a Bioconductor package which is useful to understand statistical models we use in differential expression analyses. It is interactive and helps us by creating some visual aids. http://www.bioconductor.org/packages/ExploreModelMatrix For more details, check their paper https://doi.org/10.12688/f1000research.24187.2. 
We’ll go over the examples they provide at http://www.bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html 6.3 Example 1 ## Load ExploreModelMatrix library("ExploreModelMatrix") ## Example data (sampleData <- data.frame( genotype = rep(c("A", "B"), each = 4), treatment = rep(c("ctrl", "trt"), 4) )) #> genotype treatment #> 1 A ctrl #> 2 A trt #> 3 A ctrl #> 4 A trt #> 5 B ctrl #> 6 B trt #> 7 B ctrl #> 8 B trt ## Let's make the visual aids provided by ExploreModelMatrix vd <- ExploreModelMatrix::VisualizeDesign( sampleData = sampleData, designFormula = ~ genotype + treatment, textSizeFitted = 4 ) ## Now lets plot these images cowplot::plot_grid(plotlist = vd$plotlist) Interactively, we can run the following code: ## We are using shiny again here app <- ExploreModelMatrix( sampleData = sampleData, designFormula = ~ genotype + treatment ) if (interactive()) shiny::runApp(app) 6.4 Example 2 http://bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html#example-2 6.5 Example 3 http://bioconductor.org/packages/release/bioc/vignettes/ExploreModelMatrix/inst/doc/ExploreModelMatrix.html#example-3 6.6 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise 1: Interpret ResponseResistant.Treatmentpre from the second example. It could be useful to take a screenshot and to draw some annotations on it. Exercise 2: Whis is the 0 important at the beginning of the formula in the third example? 
6.7 To learn more A guide to creating design matrices for gene expression experiments: http://bioconductor.org/packages/release/workflows/vignettes/RNAseq123/inst/doc/designmatrices.html https://f1000research.com/articles/9-1444 “Model matrix not full rank” http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#model-matrix-not-full-rank 6.8 Community Some of the ExploreModelMatrix authors: https://bsky.app/profile/csoneson.bsky.social https://twitter.com/FedeBioinfo https://twitter.com/mikelove Some of the edgeR and limma authors: https://twitter.com/mritchieau https://twitter.com/davisjmcc https://twitter.com/markrobinsonca https://twitter.com/AliciaOshlack "],["dge-model-building-with-variancepartition.html", "7 DGE model building with variancePartition 7.1 Canonical Correlation Analysis 7.2 Fit model and extract fraction of variance explained 7.3 Examine the expression of most affected genes by each sample variable References", " 7 DGE model building with variancePartition Instructor: Daianna González-Padilla After having processed RNA-seq data and assessed the quality and the variability of the samples the next step for DGE is to explore the variance in the expression of the genes themselves according to sample groups, or in other words, to quantify the contribution of the multiple sample variables in the gene expression variation. To determine which variables are the major drivers of expression variability, and importantly to define if the technical variability of RNA-seq data is low enough to study the condition of interest, we can implement an analysis of variance partition. variancePartition is a package that decomposes for each gene the expression variation into fractions of variance explained (FVE) by the sample variables in the experimental design of high-throughput genomics studies [1]. 
In order to exemplify how to implement this analysis and the type of conclusions that can be drawn from it, we’ll use bulk RNA-seq data from the smokingMouse package. ## Load the container package for this type of data library("SummarizedExperiment") ## Connect to ExperimentHub library("ExperimentHub") eh <- ExperimentHub::ExperimentHub() ## Load the datasets of the package myfiles <- query(eh, "smokingMouse") ## Download the mouse gene data rse_gene <- myfiles[["EH8313"]] ## Keep samples from nicotine experiment and pups only rse_gene_nic <- rse_gene[ , which(rse_gene$Expt == "Nicotine" & rse_gene$Age == "Pup") ] ## Use expressed genes only (i.e. that passed the filtering step) rse_gene_filt <- rse_gene_nic[ rowData(rse_gene_nic)$retained_after_feature_filtering, ] ## Keep samples that passed QC and manual sample filtering steps (all passed) rse_gene_filt <- rse_gene_filt[ , rse_gene_filt$retained_after_QC_sample_filtering & rse_gene_filt$retained_after_manual_sample_filtering ] 7.1 Canonical Correlation Analysis Prior to the variance partition analysis, evaluating the correlation between sample variables is crucial because highly correlated variables can produce unstable estimates of the variance fractions and impede the identification of the variables that really contribute to the expression variation. There are at least two problems with correlated variables: If two variables are highly correlated we could incorrectly determine that one of them contributes to gene expression changes when it was actually not explanatory but just correlated with a real contributory variable. The part of variance explained by a biologically relevant variable can be reduced by the apparent contributions of correlated variables, if for example, they contain very similar information (i.e. are redundant variables). Additionally, the analysis is better performed with simpler models, specially when we have a limited number of samples in the study. 
Hence, to remove such variables we must first identify them. We will perform a Canonical Correlation Analysis (CCA) with canCorPairs() that assesses the degree to which the variables co-vary and contain the same information. With CCA, linear combinations that maximize the correlation between variable sets are estimated. CCA is just like a normal correlation analysis between 2 vectors but it can accommodate matrices as well (variable sets). Note that CCA returns correlations values between 0 and 1 [2]. library("variancePartition") library("pheatmap") ## Plot heatmap of correlations ## Define all variables to examine; remove those with single values formula <- ~ Group + Sex + plate + flowcell + mitoRate + overallMapRate + totalAssignedGene + rRNA_rate + sum + detected + ERCCsumLogErr ## Measure correlations CCA <- canCorPairs(formula, colData(rse_gene_filt)) ## Heatmap pheatmap( CCA, ## data color = hcl.colors(50, "YlOrRd", rev = TRUE), ## color scale fontsize = 8, ## text size border_color = "black", ## border color for heatmap cells cellwidth = unit(0.4, "cm"), ## height of cells cellheight = unit(0.4, "cm") ## width of cells ) p.alert { background-color: #FFE4E1; padding: 14px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } ⚠️ Very important: always inspect if there are any correlated variables with the one of interest in your study! This is extremely important as correlated variables could represent confounding factors and/or hinder the detection of significant DE events, thus yielding to misleading results. Importantly, Group is not highly correlated with any other variable in this study, but overallMapRate is correlated with rRNA_rate, library preparation plate, and the sequencing flowcell; sum (library size) and detected (number of expressed genes) are also correlated. For a detailed definition of these variables check here. 
📝 Exercise 1: Run a CCA analysis and determine which pairs of variables in your dataset are correlated. Is there any correlated variable with your variable of interest? Depending on your results there’s sometimes convenient to dig a little deeper into the relationship between correlated variables and to analyze these metrics among our control and experimental samples. Let’s work on that! library("ggplot2") library("cowplot") ## Boxplots/Scatterplots/Barplots for each pair of correlated variables corr_plots <- function(sample_var1, sample_var2, sample_color) { ## Define sample colors by variable colors <- list( "Group" = c("Control" = "brown2", "Experimental" = "deepskyblue3"), "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "plate" = c("Plate1" = "darkorange", "Plate2" = "lightskyblue", "Plate3" = "deeppink1"), "flowcell" = c( "HKCG7DSXX" = "chartreuse2", "HKCMHDSXX" = "magenta", "HKCNKDSXX" = "turquoise3", "HKCTMDSXX" = "tomato" ) ) data <- colData(rse_gene_filt) ## a) Barplots for categorical variable vs categorical variable if (class(data[, sample_var1]) == "character" & class(data[, sample_var2]) == "character") { ## y-axis label y_label <- paste("Number of samples from each ", sample_var2, sep = "") ## Stacked barplot with counts for 2nd variable plot <- ggplot(data = as.data.frame(data), aes( x = !!rlang::sym(sample_var1), fill = !!rlang::sym(sample_var2) )) + geom_bar(position = "stack") + ## Colors by 2nd variable scale_fill_manual(values = colors[[sample_var2]]) + ## Show sample counts on stacked bars geom_text(aes(label = after_stat(count)), stat = "count", position = position_stack(vjust = 0.5), colour = "gray20", size = 3 ) + theme_bw() + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = y_label ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) 
) } ## b) Boxplots for categorical variable vs continuous variable else if (class(data[, sample_var1]) == "character" & class(data[, sample_var2]) == "numeric") { plot <- ggplot(data = as.data.frame(data), mapping = aes( x = !!rlang::sym(sample_var1), y = !!rlang::sym(sample_var2), color = !!rlang::sym(sample_var1) )) + geom_boxplot(size = 0.25, width = 0.32, color = "black", outlier.color = NA) + geom_jitter(width = 0.15, alpha = 1, size = 1.5) + stat_smooth(method = "lm", geom = "line", alpha = 0.6, size = 0.4, span = 0.3, aes(group = 1), color = "orangered3") + scale_color_manual(values = colors[[sample_var1]]) + theme_bw() + guides(color = "none") + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = gsub("_", " ", sample_var2), x = sample_var1 ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } ## c) Scatterplots for continuous variable vs continuous variable else if (class(data[, sample_var1]) == "numeric" & class(data[, sample_var2]) == "numeric") { plot <- ggplot(as.data.frame(data), aes( x = !!rlang::sym(sample_var1), y = !!rlang::sym(sample_var2), color = !!rlang::sym(sample_color) )) + geom_point(size = 2) + stat_smooth(method = "lm", geom = "line", alpha = 0.6, size = 0.6, span = 0.25, color = "orangered3") + ## Color by sample_color variable scale_color_manual(name = sample_color, values = colors[[sample_color]]) + theme_bw() + labs( subtitle = paste0("Corr: ", signif(CCA[sample_var1, sample_var2], digits = 3)), y = gsub("_", " ", sample_var2), x = gsub("_", " ", sample_var1) ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } return(plot) } As shown below, Group and plate 
are moderately correlated given that 14 of the 23 (60.8%) control samples and 11 of the 19 (57.9%) exposed samples were in the first and second plate for library preparation, respectively. ## Correlation plot for Group and plate p <- corr_plots("Group", "plate", NULL) p + theme(plot.margin = unit(c(1, 5.5, 1, 5.5), "cm")) We can also observe that even though QC metrics such as overallMapRate and rRNA_rate are correlated, there’s no distinction between control and exposed samples for these variables. ## Correlation plot for overallMapRate and rRNA_rate p <- corr_plots("overallMapRate", "rRNA_rate", "Group") p + theme(plot.margin = unit(c(2, 3.5, 2, 3.5), "cm")) Moreover, the correlation between overallMapRate and the library preparation plate is mainly given by the plate 1 samples that have lower rates, similar to what occurs with the samples from the first flowcell. ## Correlation plot for overallMapRate and plate p <- corr_plots("plate", "overallMapRate", NULL) p + theme(plot.margin = unit(c(2, 5, 2, 5), "cm")) ## Correlation plot for overallMapRate and flowcell p <- corr_plots("flowcell", "overallMapRate", NULL) p + theme(plot.margin = unit(c(2, 5, 2, 5), "cm")) Interestingly, control samples seem to present more expressed genes than exposed samples for a given library size, however none of these variables is correlated with Group. ## Correlation plots for sum and detected p <- corr_plots("sum", "detected", "Group") p + theme(plot.margin = unit(c(2, 3.5, 2, 3.5), "cm")) ❓ Now look at the following plot. Why is it important that experimental and control samples are distributed throughout all sequencing flowcells? p <- corr_plots("Group", "flowcell", NULL) plots <- plot_grid(p) plots + theme(plot.margin = unit(c(0.5, 5, 0.5, 5), "cm")) Hint: What would happen if all experimental samples were in one flowcell and all controls in another? 
After identifying which variables are correlated and exploring the metrics of control and experimental samples, the next step is to determine which variable from each pair of correlated variables should be discarded and which one included in the models.
Categorical variables are usually modeled as random effects, i.e., variables such as flowcell, plate, donor, etc. whose levels are “randomly chosen or selected from a larger population”. These levels are not of interest by themselves but the grouping of the samples by them. Random effects correspond to those variables whose effect on the expression of a gene varies according to its sample groups/levels. On the other hand, continuous variables can be modeled as fixed effects. These are sample-level variables that preserve their impact on the expression of a gene irrespective of the sample. ❓ Why is this effect distinction important? Because when we have clustered data, like gene expression values grouped by sample sex, batch, etc. we are violating the relevant assumption of independence, making an incorrect inference when using a general linear model (GLM). If we have clustered data where the variables’ values have distinct effects on gene expression, we must work with an extension of GLM, i.e. with the linear mixed model (LMM) that contains a mix of both fixed and random effects [3]. Linear mixed model fit 1️⃣ After fitting a linear model to the expression data of each gene we obtain the predicted expression of the genes given by the estimated coefficients of the variables: \\(\\hat y =\\sum_{1}^j\\beta_{j}X_{j} + \\sum_{1}^k\\alpha_{k}Z_{k}\\) ← the expression of a gene across all samples is given by the samples’ values in the \\(j\\) fixed effects and \\(k\\) random effects. Therefore, the gene expression in the sample \\(i\\) is given by \\(\\hat y_i =\\sum_{1}^j\\beta_{j}X_{ij} + \\sum_{1}^k\\alpha_{k}Z_{ik}\\). Then \\(y=\\hat y+\\epsilon\\), which means that the true (observed) expression value is given by the predicted value plus an error term (\\(\\epsilon\\)), also called noise or residual: \\[y =\\sum_{1}^j\\beta_{j}X_{j} + \\sum_{1}^k\\alpha_{k}Z_{k} + \\epsilon\\] \\(X_j\\) is the vector of the values of the samples in the \\(j\\)th fixed effect. 
\\(\\beta_j\\) is the predicted coefficient of the fixed effect \\(j\\). \\[ X_j\\beta_j= \\ \\ _{n \\ \\ samples}\\stackrel{j^{th}\\ \\ fixed\\ \\ effect }{\\begin{bmatrix} X_{1j} \\\\ ... \\\\ X_{(n-1)j} \\\\ X_{nj} \\end{bmatrix}}\\beta_j = {\\begin{bmatrix} X_{1j}\\beta_j \\\\ ... \\\\ X_{(n-1)j}\\beta_j \\\\ X_{nj} \\beta_j \\end{bmatrix}} \\] \\(Z_k\\) is the vector of values the samples have for the \\(k\\)th random effect. \\(\\alpha_k\\) is the predicted coefficient of the random effect \\(k\\). These are drawn from a normal distribution \\(∼N(0, \\sigma_{\\alpha_k}^2 )\\). \\[ Z_k\\alpha_k= \\ \\ _{n \\ \\ samples}\\stackrel{{k^{th}\\ \\ random\\ \\ effect }}{\\begin{bmatrix} Z_{1k} \\\\ ... \\\\ Z_{(n-1)k}\\\\ Z_{nk} \\end{bmatrix}}\\alpha_k = {\\begin{bmatrix} Z_{1k}\\alpha_k \\\\ ... \\\\ Z_{(n-1)k}\\alpha_k\\\\ Z_{nk}\\alpha_k \\end{bmatrix}} \\] \\(\\epsilon\\) is the noise term which is \\(y-\\hat y\\), the difference between the observed and predicted expression and is also drawn from \\(∼N(0, \\sigma_{\\epsilon}^2 )\\). Expanding, \\[ y= {\\begin{bmatrix} X_{11}\\beta_1 \\\\ ... \\\\ X_{(n-1)1}\\beta_1 \\\\ X_{n1} \\beta_1 \\end{bmatrix}} + {\\begin{bmatrix} X_{12}\\beta_2 \\\\ ... \\\\ X_{(n-1)2}\\beta_2 \\\\ X_{n2} \\beta_2 \\end{bmatrix}} + ... +{\\begin{bmatrix} X_{1j}\\beta_j \\\\ ... \\\\ X_{(n-1)j}\\beta_j \\\\ X_{nj} \\beta_j \\end{bmatrix}} + {\\begin{bmatrix} Z_{11}\\alpha_1 \\\\ ... \\\\ Z_{(n-1)1}\\alpha_1\\\\ Z_{n1}\\alpha_1 \\end{bmatrix}} +{\\begin{bmatrix} Z_{12}\\alpha_2 \\\\ ... \\\\ Z_{(n-1)2}\\alpha_2\\\\ Z_{n2}\\alpha_2 \\end{bmatrix}} \\] \\[ + ... + {\\begin{bmatrix} Z_{1k}\\alpha_k \\\\ ... \\\\ Z_{(n-1)k}\\alpha_k\\\\ Z_{nk}\\alpha_k \\end{bmatrix}} + {\\begin{bmatrix} \\epsilon_1 \\\\ ... 
\\\\ \\epsilon_{(n-1)} \\\\ \\epsilon_n \\end{bmatrix}} \\] All parameters are estimated with maximum likelihood, the default method in the variancePartition software when random effects are specified because it performs best in simulations. 2️⃣ Then, calcVarPart() computes for each fixed effect \\(\\sum_{i=1}^n(\\beta_{j}X_{ij}-\\bar{\\beta_{j}X_{j}})^2=var(\\beta_{j}X_{j})(n-1)\\), the squared sum of the predicted expression values of a gene in the \\(n\\) samples only taking into account the variable \\(j\\) in the regression model: \\(\\hat y = \\beta_{j}X_{j}\\). Each of these squared sums is scaled by additional factors but to simplify let’s just explain this analysis in terms of the variance (that is proportional to the squared sum): The variance explained by the \\(j\\)th fixed effect is: \\(\\sigma_{\\beta_j}^2=var(X_j{\\beta_j})\\) For random effects the variances are computed by variance component estimates with VarCorr() from nmle: The variance of the \\(k\\)th random effect is \\(\\sigma_{\\alpha_k}^2=var(Z_k{\\alpha_k})\\) The total variance of the expression values is calculated by \\(\\sum_{i=1}^n(y_i - \\bar { y})^2=var(y)(n-1)\\), where \\(y_i = \\sum_{1}^j\\beta_{j}X_{ij} + \\sum_{1}^k\\alpha_{k}Z_{ik} + \\epsilon_i\\) considering all variables in the model and the error: The total variance is: \\(var(y)= \\sigma_{Total}^2= var(X_{1}\\beta_1)+var(X_{2}\\beta_2)+...+var(X_{j}\\beta_j)+var(Z_{1}\\alpha_1)+var(Z_{2}\\alpha_2)+...+var(Z_{k}\\alpha_k)+var(\\epsilon)=\\) \\(\\sum_1^jvar(X_j\\beta_j)+\\sum_1^kvar(Z_k\\alpha_k)+var(\\epsilon)=\\) \\(\\sigma_{Total}^2=\\sum_1^j{ \\sigma_{\\beta_j}^2} + \\sum_1^k{ \\sigma_{\\alpha_k}^2} + \\sigma_{\\epsilon}^2\\) 3️⃣ Finally, it computes: The fraction of the total data variance explained by the \\(j\\)th fixed effect is \\(\\sigma_{\\beta_j}^2\\) / \\(\\sigma_{Total}^2\\) The fraction of the total data variance explained by the \\(k\\)th random effect is \\(\\sigma_{\\alpha_k}^2\\) / 
\\(\\sigma_{Total}^2\\) Note that \\(y=\\hat y+\\epsilon\\) because the expression can’t be completely described by a straight line, so not all the variation of \\(y\\) can be explained by the variation of the sample variables, instead \\(var(y)=var(\\hat y)+var(\\epsilon)=var(\\hat y) + \\sigma_{\\epsilon}^2\\), where \\(\\sigma_{\\epsilon}^2=\\sum_{i=1}^n(\\hat y_i -y_i)^2/n-1\\). The residual variance is \\(\\sigma_{\\epsilon}^2\\) / \\(\\sigma_{Total}^2\\) ; this is the variance that the model (with the included covariates) couldn’t explain. p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } 👉🏼 Source code of calcVarPart() here. Once we have reviewed what variancePartition computes and how, we can use it to quantify the FVE for each variable. ## Fit a linear mixed model (LMM) that takes continuous variables as fixed effects and categorical variables as random effects varPartAnalysis <- function(formula) { ## Ignore genes with variance 0 genes_var_zero <- which(apply(assays(rse_gene_filt)$logcounts, 1, var) == 0) if (length(genes_var_zero) > 0) { rse_gene_filt <- rse_gene_filt[-genes_var_zero, ] } ## Loop over each gene to fit the model and extract variance explained by each variable varPart <- fitExtractVarPartModel(assays(rse_gene_filt)$logcounts, formula, colData(rse_gene_filt)) # Sort variables by median fraction of variance explained (FVE) vp <- sortCols(varPart) p <- plotVarPart(vp) return(list(p, vp)) } In the following violin plots, we have the % of variance explained in the expression of each gene by each covariate, based on the model with all variables. Of our pairs of correlated variables, rRNA_rate has the highest median FVE and thus, should be included in the models for DGE, whereas variables correlated with it (overallMapRate) must be removed. Furthermore, library preparation plate must be excluded as it is correlated with Group. 
##### Fit model with all variables ##### # sum, detected, and ERCCsumLogErr are not included as they are in very different scales! formula <- ~ (1 | Group) + (1 | Sex) + (1 | plate) + (1 | flowcell) + mitoRate + overallMapRate + totalAssignedGene + rRNA_rate plot <- varPartAnalysis(formula)[[1]] plot + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), axis.text.x = element_text(size = (7)), axis.text.y = element_text(size = (7.5)) ) ⚠️ Note that some variables such as the library size and the number of detected genes that are in different orders of magnitude to the rest of the QC metrics and categorical variables are not included in this analysis as they can impact the model predictions and the interpretability of the regression results [4]. These variables can be analyzed only after rescaling. After re-running the analysis without the previous correlated variables, now Group contribution increases but so does the residual source, i.e., the % of gene expression variance that the model couldn’t explain increases, although the increase is rather low. This occurs because when we remove independent variables to a regression equation, we can explain less of the variance of the dependent variable [3]. That’s the price to pay when dropping variables, but it is convenient when we don’t have many samples for the model to determine variable unique contributions. ##### Fit model without correlated variables ##### ## Pup plots without overallMapRate and plate formula <- ~ (1 | Group) + (1 | Sex) + (1 | flowcell) + mitoRate + overallMapRate + totalAssignedGene varPart <- varPartAnalysis(formula) varPart_data <- varPart[[2]] plot <- varPart[[1]] plot + theme( plot.margin = unit(c(1, 1, 1, 1), "cm"), axis.text.x = element_text(size = (7)), axis.text.y = element_text(size = (7.5)) ) 📝 Exercise 2: Perform a variance partition analysis and determine which of your correlated variables have higher contributions in gene expression variance. 
Based on that, select a set of variables to model gene expression for DGE. But what does it mean that a variable explains a high percentage of the expression variation of a gene? In the following section will visualize the existing relationships between the gene expression values in the samples and the sample-level variables. 7.3 Examine the expression of most affected genes by each sample variable In the plots presented below we can appreciate the expression levels across samples of the most affected genes by each variable, i.e., the genes for which the variable explains the highest percentages of variance, plotted against the sample values for the same variable. Observe the strong correlations that exist for the sample variables and the gene expression of such affected genes, which ends up causing these variables to explain high percentages of gene expression variation and which obligate us to adjust for them in the models. library("rlang") ## Plot of gene expression lognorm counts vs. 
sample variable plot_gene_expr <- function(sample_var, gene_id) { colors <- list( "Group" = c("Control" = "brown2", "Experimental" = "deepskyblue3"), "Age" = c("Adult" = "slateblue3", "Pup" = "yellow3"), "Sex" = c("F" = "hotpink1", "M" = "dodgerblue"), "Pregnancy" = c("Yes" = "darkorchid3", "No" = "darkolivegreen4"), "plate" = c("Plate1" = "darkorange", "Plate2" = "lightskyblue", "Plate3" = "deeppink1"), "flowcell" = c( "HKCG7DSXX" = "chartreuse2", "HKCMHDSXX" = "magenta", "HKCNKDSXX" = "turquoise3", "HKCTMDSXX" = "tomato" ) ) ## Lognorm counts of the gene across samples data <- colData(rse_gene_filt) data$gene_expr <- assays(rse_gene_filt)$logcounts[gene_id, ] ## Percentage of variance explained by the variable percentage <- 100 * signif(varPart_data[gene_id, sample_var], digits = 3) ## Boxplots for categorical variables if (class(data[, sample_var]) == "character") { plot <- ggplot(data = as.data.frame(data), mapping = aes( x = !!rlang::sym(sample_var), y = gene_expr, color = !!rlang::sym(sample_var) )) + geom_boxplot(size = 0.25, width = 0.32, color = "black", outlier.color = "#FFFFFFFF") + geom_jitter(width = 0.15, alpha = 1, size = 1) + stat_smooth(geom = "line", alpha = 0.6, size = 0.4, span = 0.3, method = "lm", aes(group = 1), color = "orangered3") + scale_color_manual(values = colors[[sample_var]]) + theme_bw() + guides(color = "none") + labs( title = gene_id, subtitle = paste0("Variance explained: ", percentage, "%"), y = "lognorm counts", x = sample_var ) + theme( axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.title = element_text(hjust = 0.5, size = 7.5, face = "bold"), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } ## Scatterplots for continuous variables else { colors <- c( "mitoRate" = "khaki3", "overallMapRate" = "turquoise", "totalAssignedGene" = "plum2", "rRNA_rate" = "orange3", "sum" = "palegreen3", "detected" = 
"skyblue2", "ERCCsumLogErr" = "slateblue1" ) plot <- ggplot(as.data.frame(data), aes(x = eval(parse_expr(sample_var)), y = gene_expr)) + geom_point(color = colors[[sample_var]], size = 2) + stat_smooth(geom = "line", alpha = 0.4, size = 0.4, span = 0.25, method = "lm", color = "orangered3") + theme_bw() + guides(color = "none") + labs( title = gene_id, subtitle = paste0("Variance explained: ", percentage, "%"), y = "lognorm counts", x = gsub("_", " ", sample_var) ) + theme( plot.margin = unit(c(0.4, 0.1, 0.4, 0.1), "cm"), axis.title = element_text(size = (7)), axis.text = element_text(size = (6)), plot.title = element_text(hjust = 0.5, size = 7.5, face = "bold"), plot.subtitle = element_text(size = 7, color = "gray40"), legend.text = element_text(size = 6), legend.title = element_text(size = 7) ) } return(plot) } ## Function to plot gene expression vs sample variable data for top 3 most affected genes plot_gene_expr_sample <- function(sample_var) { ## Top 3 genes most affected by sample variable affected_genes <- rownames(varPart_data[order(varPart_data[, sample_var], decreasing = TRUE), ][1:3, ]) ## Plots plots <- list() for (i in 1:length(affected_genes)) { plots[[i]] <- plot_gene_expr(sample_var, affected_genes[i]) } plot_grid(plots[[1]], plots[[2]], plots[[3]], ncol = 3) } ## Plots for top affected genes by 'overallMapRate' plots <- plot_gene_expr_sample("overallMapRate") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'totalAssignedGene' plots <- plot_gene_expr_sample("totalAssignedGene") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'Group' plots <- plot_gene_expr_sample("Group") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) ## Plots for top affected genes by 'Sex' (genes in sexual chrs) plots <- plot_gene_expr_sample("Sex") plots + theme(plot.margin = unit(c(3, 1, 2, 3), "cm")) p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; 
margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 📝 Exercise 3: What % of variance does Group explain for the gene ENSMUSG00000042348.10? Create the boxplots for its counts in control and experimental samples. Is it more likely that the gene is up-regulated or down-regulated? 📝 Exercise 4: Do the same for the gene ENSMUSG00000064372.1. What do you observe in terms of variance percentage and sample differences? References Hoffman, G. E., & Schadt, E. E. (2016). variancePartition: interpreting drivers of variation in complex gene expression studies.BMC bioinformatics, 17(1), 1-13. Hoffman, G. (2022). variancePartition: Quantifying and interpreting drivers of variation in multilevel gene expression experiments. van den Berg, S. M. (2022). Analysing data using linear models. Web site: https://bookdown.org/pingapang9/linear_models_bookdown/ Simoiu, C. & Savage, J. (2016). A bag of tips and tricks for dealing with scale issues. Web site: https://rpubs.com/jimsavage/scale_issues "],["differential-gene-expression-exercise.html", "8 Differential gene expression exercise 8.1 Recap 8.2 Exercise", " 8 Differential gene expression exercise Instructor: Leo 8.1 Recap So far we know how to: choose a study from recount3 download data for a study with recount3::create_rse() explore the data interactively with iSEE expand Sequence Read Archive (SRA) attributes sometimes we need to clean them up a bit before we can use them use edgeR::calcNormFactors() to reduce composition bias We didn’t show it explicitly this time build a differential gene expression model with model.matrix() explore and interpret the model with ExploreModelMatrix use limma::voom() and related functions to compute the differential gene expression statistics extract the DEG statistics with limma::topTable(sort.by = \"none\") among several other plots and tools we learned along the way. 
Alternatively to recount3, we have learned about the RangedSummarizedExperiment objects produced by SPEAQeasy and in particular the one we are using on the smokingMouse project. You might have your own data already. Maybe you have it as an AnnData python object. If so, you can convert it to R with zellkonverter. 8.2 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise option 1: This will be an open ended exercise. Think of it as time to practice what we’ve learnt using data from recount3 or another subset of the smokingMouse dataset. You could also choose to re-run code from earlier parts of the course and ask clarifying questions. You could also use this time to adapt some of the code we’ve covered to use it with your own dataset. If you prefer a more structured exercise: Exercise option 2: Choose two recount3 studies that can be used to study similar research questions. For example, two studies with brain samples across age. Download and process each dataset independently, up to the point where you have differential expression t-statistics for both. Skip most of the exploratory data analyses steps as for the purpose of this exercise, we are most interested in the DEG t-statistics. If you don’t want to choose another recount3 study, you could use the smokingMouse data and subset it once to the pups in nicotine arm of the study and a second time for the pups in the smoking arm of the study. Or you could use the GTEx brain data from recount3, subset it to the prefrontal cortex (PFC), and compute age related expression changes. That would be in addition to SRA study SRP045638 as was showcased in the 2023 version of this course. recount3::create_rse_manual( project = "BRAIN", project_home = "data_sources/gtex", organism = "human", annotation = "gencode_v26", type = "gene" ) Make a scatterplot of the t-statistics between the two datasets to assess correlation / concordance. 
You might want to use GGally::ggpairs() for this https://ggobi.github.io/ggally/reference/ggpairs.html. Or ggpubr::ggscatter() https://rpkgs.datanovia.com/ggpubr/reference/ggscatter.html. For example, between the GTEx PFC data and the data from SRA study SRP045638 provided by recount3. Or between the nicotine-exposed pups and the smoking-exposed pups in smokingMouse. Or using the two recount3 studies you chose. Are there any DEGs FDR < 5% in both datasets? Or FDR < 5% in dataset 1 that have a p-value < 5% in the other one? You could choose to make a concordance at the top plot like at http://leekgroup.github.io/recount-analyses/example_de/recount_SRP019936.html, though you will likely need more time to complete this. "],["research-talks.html", "9 Research talks 9.1 Fentanyl rat study 9.2 Cg Hb cell projectors study 9.3 deconvolution-benchmark", " 9 Research talks 9.1 Fentanyl rat study Daianna 9.2 Cg Hb cell projectors study Melissa 9.3 deconvolution-benchmark Leonardo Here’s Louise A. Huuki-Myers LIBD seminar if you prefer to watch the recording. "],["biocthis-introduction.html", "10 biocthis introduction 10.1 Related past materials 10.2 biocthis main commands 10.3 Live demo 10.4 Community", " 10 biocthis introduction Instructor: Leo 10.1 Related past materials I’ve taught a lot about biocthis over the years. Here’s a 2020 video: and more recently, these are the LIBD rstats club 2023-03-10 notes. 10.2 biocthis main commands https://bioconductor.org/packages/biocthis pkgdown documentation website: https://lcolladotor.github.io/biocthis/ biocthis::use_bioc_pkg_templates() documentation: https://lcolladotor.github.io/biocthis/reference/use_bioc_pkg_templates.html These are the main steps you will need to know to make a Bioconductor package with biocthis: You first will need to create a package using a command from usethis. 
For example: usethis::create_package(\"~/Desktop/cshl2024pkg\") Now that you have a package, we can use biocthis to create 4 template R scripts that will guide you and help you make the full structure for a Bioconductor R package. On your new R package (cshl2024pkg), we can now use biocthis::use_bioc_pkg_templates(). In part these commands were born out of my own self interest to make it easier to make new packages instead of copy-pasting the contents of an older one, then manually adjusting all the pieces for a new package. See https://lcolladotor.github.io/pkgs/ for the list of all the R packages I’ve been involved in. 10.3 Live demo Here is the live demo result https://github.com/lcolladotor/cshl2024pkg/ with its companion documentation website at https://lcolladotor.github.io/cshl2024pkg/. You might also want to check the 2023 version at https://github.com/lcolladotor/cshl2024pkg/. Check the git commit history at https://github.com/lcolladotor/cshl2024pkg/commits/devel and the GitHub Actions history at https://github.com/lcolladotor/cshl2024pkg/actions. We can see at https://app.codecov.io/gh/lcolladotor/cshl2024pkg the code coverage results for this demonstration package. 10.3.1 Example function Let’s have a function to work with: weekday_praise(). weekday_praise <- function(date = Sys.Date()) { date <- as.Date(date) date_weekday <- weekdays(date) paste0(date_weekday, ": ", praise::praise()) } weekday_praise() #> [1] "Tuesday: You are super!" weekday_praise("2024-06-09") #> [1] "Sunday: You are dandy!" Here’s the full code for the function and its documentation. #' Praise a weekday #' #' Given a date, figure out which weekday it was, then write a positive #' message. #' #' @param date A `base::Date` object or a `character()` in a format that can be #' converted to a `base::Date` object with `base::as.Date()`. 
#' #' @importFrom praise praise #' @export #' @examples #' #' ## Praise the current weekday #' weekday_praise() #' #' ## Praise the date we started teaching #' weekday_praise("2024-06-09") #' #' ## Praise the current weekday in a reproducible way #' set.seed(20240610) #' weekday_praise() #' #' ## Verify that it's reproducible #' set.seed(20240610) #' weekday_praise() weekday_praise <- function(date = Sys.Date()) { date <- as.Date(date) date_weekday <- weekdays(date) paste0(date_weekday, ": ", praise::praise()) } Here’s a test for our function too. library("testthat") #> #> Attaching package: 'testthat' #> The following objects are masked from 'package:rlang': #> #> is_false, is_null, is_true #> The following object is masked from 'package:Hmisc': #> #> describe ## Verify that we get the result we wanted set.seed(20240610) expect_equal(weekday_praise("2024-06-09"), "Sunday: You are wondrous!") ## Verify that we get an error if the input is not correct expect_error(weekday_praise("240609")) ## Should work for a vector input expect_equal(length(weekday_praise(c("2024-06-09", "2024-06-10"))), 2L) 10.4 Community For more materials on R/Bioconductor package development check http://contributions.bioconductor.org/. I’m on a Friday night mood now enjoying @lmwebr’s #OSTA workshop 🔥, feeling grateful 🙏🏽 to everyone who nominated me for the #BioC2021 community award 🥇& celebrating 🍺 https://t.co/2oFLdGO3UhSee you in #BioC2022🤞🏽 @Bioconductor #rstats @CDSBMexico https://t.co/0SGHDfiRCs pic.twitter.com/UmM9nMP2W2 — 🇲🇽 Leonardo Collado-Torres (@lcolladotor) August 7, 2021 biocthis is one of the reasons for my 2021 Bioconductor community award :-) Do you want to play an active role? Join the cloud-working-group Slack channel. 
"],["scrna-seq-data-analysis-overview.html", "11 scRNA-seq data analysis overview 11.1 Single cell RNA sequencing 11.2 Basic Workflow 11.3 The SingleCellExperiment class 11.4 Quality Control 11.5 Normalization 11.6 Feature selection 11.7 Dimensionality reduction 11.8 Clustering 11.9 Marker gene detection 11.10 Cell type annotation 11.11 Getting ready again 11.12 References", " 11 scRNA-seq data analysis overview Instructor: Melissa Mayén Quiroz Adapted from: OSCA: Basics of Single-Cell Analysis with Bioconductor 11.1 Single cell RNA sequencing Single-cell RNA sequencing (scRNA-seq) is a cutting-edge technology used to analyze the gene expression profiles of individual cells. Unlike traditional bulk RNA sequencing, which provides an average expression profile of a population of cells, scRNA-seq allows researchers to study the gene expression patterns of single cells. Cell heterogeneity Cell type identification Cell state dynamics Orchestrating Single-Cell Analysis with Bioconductor Authors: Robert Amezquita [aut], Aaron Lun [aut], Stephanie Hicks [aut], Raphael Gottardo [aut], Alan O’Callaghan [cre] 11.1.1 Pre-processing of scRNA-seq Data (Before R) Quality Control of the reads (FastQC): Assess the quality of raw sequencing reads. Check GC content, overrepresented sequences, presence of N bases, and other quality metrics. Alignment to Reference Transcriptome: Align sequencing reads to a reference transcriptome. Generate aligned read files. Generation of Expression Count Matrix: Quantify gene expression levels by counting the number of reads mapped to each gene. Create a matrix with genes as rows and cells as columns, where each entry represents the count of reads for a specific gene in a specific cell. For 10x Genomics data, the Cellranger software suite (Zheng et al. 2017) provides a custom pipeline to obtain a count matrix. This uses STAR to align reads to the reference genome and then counts the number of unique UMIs mapped to each gene. 
11.1.2 Different Technologies Droplet-based: 10x Genomics, inDrop, Drop-seq Plate-based with unique molecular identifiers (UMIs): CEL-seq, MARS-seq Plate-based with reads: Smart-seq2 Other: sci-RNA-seq, Seq-Well In practical terms, droplet-based technologies are the current de facto standard due to their throughput and low cost per cell. Plate-based methods can capture other phenotypic information (e.g., morphology) and are more amenable to customization. Read-based methods provide whole-transcript coverage, which is useful in some applications (e.g., splicing, exome mutations); otherwise, UMI-based methods are more popular as they mitigate the effects of PCR amplification noise. 11.2 Basic Workflow In the simplest case, the workflow has the following form: We compute quality control metrics to remove low-quality cells that would interfere with downstream analyses. These cells may have been damaged during processing or may not have been fully captured by the sequencing protocol. Common metrics includes the total counts per cell, the proportion of spike-in or mitochondrial reads and the number of detected features. We convert the counts into normalized expression values to eliminate cell-specific biases (e.g., in capture efficiency). This allows us to perform explicit comparisons across cells in downstream steps like clustering. We also apply a transformation, typically log, to adjust for the mean-variance relationship. We perform feature selection to pick a subset of interesting features for downstream analysis. This is done by modelling the variance across cells for each gene and retaining genes that are highly variable. The aim is to reduce computational overhead and noise from uninteresting genes. We apply dimensionality reduction to compact the data and further reduce noise. 
Principal components analysis is typically used to obtain an initial low-rank representation for more computational work, followed by more aggressive methods like t-stochastic neighbor embedding for visualization purposes. We cluster cells into groups according to similarities in their (normalized) expression profiles. This aims to obtain groupings that serve as empirical proxies for distinct biological states. We typically interpret these groupings by identifying differentially expressed marker genes between clusters. 11.3 The SingleCellExperiment class This object is specifically designed to store and analyze single-cell RNA sequencing (scRNA-seq) data. It extends the SummarizedExperiment class to include specialized features for single-cell data, such as cell identifiers, dimensionality reduction results, and methods for quality control and normalization. Assay Data: The primary data matrix containing gene expression values or other measurements. Rows represent genes and columns represent cells. colData (Column Metadata): Additional information about each cell, such as cell type, experimental condition, or any other relevant metadata. rowData (Row Metadata): Additional information about each gene, such as gene symbols, genomic coordinates, or functional annotations. reducedDims: Dimensionality reduction results, such as “principal component analysis” (PCA), “t-distributed stochastic neighbor embedding” (t-SNE), and “Uniform Manifold Approximation and Projection” (UMAP), used for visualizing and clustering cells. altExpNames and altExps: Names of alternative experiments (such as spike-in control genes used for normalization) and alternative experiment counts matrices. metadata: Additional metadata about the experiment. 11.3.1 Data Loading The Lun et al. (2017) dataset contains two 96-well plates of 416B cells (an immortalized mouse myeloid progenitor cell line), processed using the Smart-seq2 protocol (Picelli et al. 2014). 
A constant amount of spike-in RNA from the External RNA Controls Consortium (ERCC) was also added to each cell’s lysate prior to library preparation. library("scRNAseq") library("SingleCellExperiment") library("AnnotationHub") library("scater") ## Load the data set sce.416b <- LunSpikeInData(which = "416b") #> downloading 1 resources #> retrieving 1 resource #> loading from cache #> require("ensembldb") ## We convert the blocking factor to a factor so that downstream steps do not treat it as an integer. sce.416b$block <- factor(sce.416b$block) ## rename the rows with the symbols, reverting to Ensembl identifiers ens.mm.v97 <- AnnotationHub()[["AH73905"]] #> loading from cache rowData(sce.416b)$ENSEMBL <- rownames(sce.416b) rowData(sce.416b)$SYMBOL <- mapIds(ens.mm.v97, keys = rownames(sce.416b), keytype = "GENEID", column = "SYMBOL" ) #> Warning: Unable to map 563 of 46604 requested IDs. rowData(sce.416b)$SEQNAME <- mapIds(ens.mm.v97, keys = rownames(sce.416b), keytype = "GENEID", column = "SEQNAME" ) #> Warning: Unable to map 563 of 46604 requested IDs. rownames(sce.416b) <- uniquifyFeatureNames( rowData(sce.416b)$ENSEMBL, rowData(sce.416b)$SYMBOL ) 11.3.2 Basics of your SCE ## Look at your SCE sce.416b #> class: SingleCellExperiment #> dim: 46604 192 #> metadata(0): #> assays(1): counts #> rownames(46604): 4933401J01Rik Gm26206 ... CAAA01147332.1 CBFB-MYH11-mcherry #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(192): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 ... #> SLX-11312.N712_S508.H5H5YBBXX.s_8.r_1 SLX-11312.N712_S517.H5H5YBBXX.s_8.r_1 #> colData names(8): cell line cell type ... 
spike-in addition block #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV ## Get in the slot "assay", in the count matrix ## [genes, cells] assay(sce.416b, "counts")[110:113, 1:2] # gene, cell #> 4 x 2 sparse Matrix of class "dgCMatrix" #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> 1700034P13Rik . . #> Sgk3 8 . #> Gm6195 2 3 #> Gm22607 . . ## We can do it like this too counts(sce.416b)[110:113, 1:2] #> 4 x 2 sparse Matrix of class "dgCMatrix" #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> 1700034P13Rik . . #> Sgk3 8 . #> Gm6195 2 3 #> Gm22607 . . ## We could add more assays to our SCE sce.416b <- logNormCounts(sce.416b) sce.416b #> class: SingleCellExperiment #> dim: 46604 192 #> metadata(0): #> assays(2): counts logcounts #> rownames(46604): 4933401J01Rik Gm26206 ... CAAA01147332.1 CBFB-MYH11-mcherry #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(192): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 ... #> SLX-11312.N712_S508.H5H5YBBXX.s_8.r_1 SLX-11312.N712_S517.H5H5YBBXX.s_8.r_1 #> colData names(9): cell line cell type ... block sizeFactor #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV ## Acces to the column names (cell identifyers) head(colnames(sce.416b)) #> [1] "SLX-9555.N701_S502.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S503.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S504.C89V9ANXX.s_1.r_1" #> [4] "SLX-9555.N701_S505.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S506.C89V9ANXX.s_1.r_1" "SLX-9555.N701_S507.C89V9ANXX.s_1.r_1" ## Acces to the column data (cell information) head(colData(sce.416b)) #> DataFrame with 6 rows and 9 columns #> cell line cell type single cell well quality genotype #> <character> <character> <character> <character> #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. 
#> SLX-9555.N701_S504.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> SLX-9555.N701_S505.C89V9ANXX.s_1.r_1 416B embryonic stem cell OK Doxycycline-inducibl.. #> phenotype strain spike-in addition block sizeFactor #> <character> <character> <character> <factor> <numeric> #> SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 0.742741 #> SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 0.923157 #> SLX-9555.N701_S504.C89V9ANXX.s_1.r_1 wild type phenotype B6D2F1-J ERCC+SIRV 20160113 1.012242 #> SLX-9555.N701_S505.C89V9ANXX.s_1.r_1 induced CBFB-MYH11 o.. B6D2F1-J ERCC+SIRV 20160113 1.151585 #> [ reached getOption("max.print") -- omitted 2 rows ] ## Acces to the row names (gene names) head(rownames(sce.416b)) #> [1] "4933401J01Rik" "Gm26206" "Xkr4" "Gm18956" "Gm37180" "Gm37363" ## Acces to the row data (gene information) head(rowData(sce.416b)) #> DataFrame with 6 rows and 4 columns #> Length ENSEMBL SYMBOL SEQNAME #> <integer> <character> <character> <character> #> 4933401J01Rik 1070 ENSMUSG00000102693 4933401J01Rik 1 #> Gm26206 110 ENSMUSG00000064842 Gm26206 1 #> Xkr4 6094 ENSMUSG00000051951 Xkr4 1 #> Gm18956 480 ENSMUSG00000102851 Gm18956 1 #> Gm37180 2819 ENSMUSG00000103377 Gm37180 1 #> Gm37363 2233 ENSMUSG00000104017 Gm37363 1 ## We can create another SCE subsetitng the first one sce_2 <- sce.416b[110:130, 1:2] sce_2 #> class: SingleCellExperiment #> dim: 21 2 #> metadata(0): #> assays(2): counts logcounts #> rownames(21): 1700034P13Rik Sgk3 ... Gm38005 Gm15604 #> rowData names(4): Length ENSEMBL SYMBOL SEQNAME #> colnames(2): SLX-9555.N701_S502.C89V9ANXX.s_1.r_1 SLX-9555.N701_S503.C89V9ANXX.s_1.r_1 #> colData names(9): cell line cell type ... block sizeFactor #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(2): ERCC SIRV As in the SummarizedExperiment, $ is the operator used to access a specific column within the cell metadata. 
That is, it’s a shortcut for colData(obj)$. head(sce.416b$`cell type`) #> [1] "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" "embryonic stem cell" #> [6] "embryonic stem cell" Now, we will look at the dimension reductions ## This is empty reducedDimNames(sce_2) #> character(0) ## Compute PCA sce_2 <- runPCA(sce_2) #> Warning in check_numbers(k = k, nu = nu, nv = nv, limit = min(dim(x)) - : more singular values/vectors requested than #> available #> Warning in (function (A, nv = 5, nu = nv, maxit = 1000, work = nv + 7, reorth = TRUE, : You're computing too large a #> percentage of total singular values, use a standard svd instead. ## Check again reducedDimNames(sce_2) #> [1] "PCA" 11.4 Quality Control Low-quality libraries in scRNA-seq data can arise from a variety of sources such as cell damage during dissociation or failure in library preparation (e.g., inefficient reverse transcription or PCR amplification). These usually manifest as “cells” with low total counts, few expressed genes and high mitochondrial or spike-in proportions. These low-quality libraries are problematic as they can contribute to misleading results in downstream analyses. 11.4.1 Common choices of QC metrics For each cell, we calculate these QC metrics using the perCellQCMetrics() function from the scater package (McCarthy et al. 2017). The sum column contains the total count for each cell and the detected column contains the number of detected genes. The subsets_Mito_percent column contains the percentage of reads mapped to mitochondrial transcripts. Finally, the altexps_ERCC_percent column contains the percentage of reads mapped to ERCC transcripts. 
library("scuttle") ## Identify mitochondrial genes (those with SEQNAME equal to "MT") in the row data mito <- which(rowData(sce.416b)$SEQNAME == "MT") ## Compute per-cell QC metrics, including a subset for mitochondrial genes stats <- perCellQCMetrics(sce.416b, subsets = list(Mt = mito)) summary(stats$sum) # total library sizes for all cells #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 27084 856350 1111252 1165865 1328301 4398883 summary(stats$detected) # detected features (genes) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 5609 7502 8341 8397 9208 11380 summary(stats$subsets_Mt_percent) # percentage of reads mapping to mitochondrial genes #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 4.593 7.294 8.139 8.146 9.035 15.617 summary(stats$altexps_ERCC_percent) # percentage of reads mapping to spike-in controls #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 2.242 4.291 6.031 6.412 8.126 19.429 Alternatively, users may prefer to use the addPerCellQC() function. This computes and appends the per-cell QC statistics to the colData of the SingleCellExperiment object, allowing us to retain all relevant information in a single object for later manipulation. ## Compute addPerCellQCMetrics, including a subset for mitochondrial genes sce.416b <- addPerCellQCMetrics(sce.416b, subsets = list(Mito = mito)) colnames(colData(sce.416b)) #> [1] "cell line" "cell type" "single cell well quality" "genotype" #> [5] "phenotype" "strain" "spike-in addition" "block" #> [9] "sizeFactor" "sum" "detected" "subsets_Mito_sum" #> [13] "subsets_Mito_detected" "subsets_Mito_percent" "altexps_ERCC_sum" "altexps_ERCC_detected" #> [17] "altexps_ERCC_percent" "altexps_SIRV_sum" "altexps_SIRV_detected" "altexps_SIRV_percent" #> [21] "total" A key assumption here is that the QC metrics are independent of the biological state of each cell. 
Poor values (e.g., low library sizes, high mitochondrial proportions) are presumed to be driven by technical factors rather than biological processes, meaning that the subsequent removal of cells will not misrepresent the biology in downstream analyses. 11.4.2 Identifying low-quality cells 11.4.2.1 With fixed thresholds The simplest approach to identifying low-quality cells involves applying fixed thresholds to the QC metrics. For example, we might consider cells to be low quality if they have library sizes below 100,000 reads; express fewer than 5,000 genes; have spike-in proportions above 10%; or have mitochondrial proportions above 10%. ## Using our previous perCellQCMetrics data: ## Identify cells with a total library size (sum of counts) less than 100,000 c.lib <- stats$sum < 1e5 ## Identify cells with fewer than 5,000 detected features (genes) qc.nexprs <- stats$detected < 5e3 ## Identify cells with more than 10% of reads mapping to spike-in controls (e.g., ERCC) qc.spike <- stats$altexps_ERCC_percent > 10 ## Identify cells with more than 10% of reads mapping to mitochondrial genes qc.mito <- stats$subsets_Mt_percent > 10 ## Create a combined logical vector that marks cells to discard if they meet any of the above criteria discard <- c.lib | qc.nexprs | qc.spike | qc.mito ## Summarize the number of cells removed for each reason. 
DataFrame( LibSize = sum(c.lib), # Number of cells removed due to low library size NExprs = sum(qc.nexprs), # Number of cells removed due to low number of detected features SpikeProp = sum(qc.spike), # Number of cells removed due to high spike-in proportion MitoProp = sum(qc.mito), # Number of cells removed due to high mitochondrial proportion Total = sum(discard) # Total number of cells removed ) #> DataFrame with 1 row and 5 columns #> LibSize NExprs SpikeProp MitoProp Total #> <integer> <integer> <integer> <integer> <integer> #> 1 3 0 19 14 33 While simple, this strategy requires considerable experience to determine appropriate thresholds for each experimental protocol and biological system. Thresholds for read count-based data are not applicable for UMI-based data, and vice versa. Differences in mitochondrial activity or total RNA content require constant adjustment of the mitochondrial and spike-in thresholds, respectively, for different biological systems. Indeed, even with the same protocol and system, the appropriate threshold can vary from run to run due to the vagaries of cDNA capture efficiency and sequencing depth per cell. 11.4.2.2 With adaptive threshold Here, we assume that most of the dataset consists of high-quality cells. We then identify cells that are outliers for the various QC metrics, based on the median absolute deviation (MAD) from the median value of each metric across all cells. By default, we consider a value to be an outlier if it is more than 3 MADs from the median in the “problematic” direction. We can do that using the perCellQCFilters() function. It will allow us to identify cells with log-transformed library sizes that are more than 3 MADs below the median. A log-transformation is used to improve resolution at small values when type = \"lower\" and to avoid negative thresholds that would be meaningless for a non-negative metric. 
perCellQCFilters() will also identify outliers for the proportion-based metrics specified in the sub.fields= arguments. These distributions frequently exhibit a heavy right tail, but unlike the two previous metrics, it is the right tail itself that contains the putative low-quality cells. Thus, we do not perform any transformation to shrink the tail - rather, our hope is that the cells in the tail are identified as large outliers. A cell that is an outlier for any of these metrics is considered to be of low quality and discarded. This is captured in the discard column, which can be used for later filtering. ## Identify cells that are outliers reasons <- perCellQCFilters(stats, sub.fields = c("subsets_Mt_percent", "altexps_ERCC_percent") ) # No transformation colSums(as.matrix(reasons)) #> low_lib_size low_n_features high_subsets_Mt_percent high_altexps_ERCC_percent #> 4 0 2 1 #> discard #> 6 ## Extract the exact filter thresholds attr(reasons$low_lib_size, "thresholds") #> lower higher #> 434082.9 Inf attr(reasons$low_n_features, "thresholds") #> lower higher #> 5231.468 Inf With this strategy, the thresholds adapt to both the location and spread of the distribution of values for a given metric. This allows the QC procedure to adjust to changes in sequencing depth, cDNA capture efficiency, mitochondrial content, etc. without requiring any user intervention or prior experience. However, the underlying assumption of a high-quality majority may not always be appropriate. 11.4.3 Checking diagnostic plots It is good practice to inspect the distributions of QC metrics to identify possible problems. In the most ideal case, we would see normal distributions that would justify the 3 MAD threshold used in outlier detection. 
A large proportion of cells in another mode suggests that the QC metrics might be correlated with some biological state, potentially leading to the loss of distinct cell types during filtering; or that there were inconsistencies with library preparation for a subset of cells, a not-uncommon phenomenon in plate-based protocols. library("scater") ## Add the information to the SCE columns colData(sce.416b) <- cbind(colData(sce.416b), stats) sce.416b$block <- factor(sce.416b$block) sce.416b$phenotype <- ifelse(grepl("induced", sce.416b$phenotype), "induced", "wild type") sce.416b$discard <- reasons$discard ## Plot gridExtra::grid.arrange( ## Discard low total counts plotColData(sce.416b, x = "block", y = "sum", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + scale_y_log10() + ggtitle("Total count"), ## Discard low detected genes plotColData(sce.416b, x = "block", y = "detected", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + scale_y_log10() + ggtitle("Detected features"), ## Discard high mitochondrial percentage plotColData(sce.416b, x = "block", y = "subsets_Mito_percent", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + ggtitle("Mito percent"), ## Discard high ERCC percentage plotColData(sce.416b, x = "block", y = "altexps_ERCC_percent", colour_by = "discard", other_fields = "phenotype" ) + facet_wrap(~phenotype) + ggtitle("ERCC percent"), ncol = 1 ) You can also create some plots via iSEE :) p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Optional: Create at least 1 QC plot using iSEE. Clue: Use the Column Data Plot 1 panel library("iSEE") iSEE(sce.416b) 11.4.4 Removing low-quality cells Once low-quality cells have been identified, we can choose to either remove them or mark them. Removal is the most straightforward option and is achieved by subsetting the SingleCellExperiment by column. 
In this case, we use the previous low-quality calls to generate a subsetted SingleCellExperiment that we would use for downstream analyses. ## Keep the columns we DON'T want to discard. filtered <- sce.416b[, !reasons$discard] Another option is to simply mark the low-quality cells as such and retain them in the downstream analysis. 11.5 Normalization Systematic differences in sequencing coverage between libraries are often observed in single-cell RNA sequencing data which typically arise from technical differences in cDNA capture or PCR amplification efficiency across cells, attributable to the difficulty of achieving consistent library preparation. Normalization aims to remove these differences such that they do not interfere with comparisons of the expression profiles between cells. This will ensure that any observed heterogeneity or differential expression within the cell population is driven by biology and not technical biases. Let's load another dataset and quickly review what we have learned. library("scRNAseq") library("scater") ## Load dataset sce.zeisel <- ZeiselBrainData() sce.zeisel <- aggregateAcrossFeatures(sce.zeisel, ids = sub("_loc[0-9]+$", "", rownames(sce.zeisel)) ) ## Compute perCellQCMetrics stats <- perCellQCMetrics(sce.zeisel, subsets = list( Mt = rowData(sce.zeisel)$featureType == "mito" )) ## Compute quickPerCellQC qc <- quickPerCellQC(stats, percent_subsets = c( "altexps_ERCC_percent", "subsets_Mt_percent" )) ## Discard low quality cells sce.zeisel <- sce.zeisel[, !qc$discard] Scaling normalization Scaling normalization is the simplest and most commonly used class of normalization strategies. This involves dividing all counts for each cell by a cell-specific scaling factor, often called a “size factor” (Anders and Huber 2010). The assumption here is that any cell-specific bias (e.g., in capture or amplification efficiency) affects all genes equally via scaling of the expected mean count for that cell. 
The size factor for each cell represents the estimate of the relative bias in that cell, so division of its counts by its size factor should remove that bias. 11.5.1 Library size normalization Library size normalization is the simplest strategy for performing scaling normalization. We define the library size as the total sum of counts across all genes for each cell, the expected value of which is assumed to scale with any cell-specific biases. The “library size factor” for each cell is then directly proportional to its library size where the proportionality constant is defined such that the mean size factor across all cells is equal to 1. This definition ensures that the normalized expression values are on the same scale as the original counts, which is useful for interpretation (especially when dealing with transformed data). library("scater") ## Compute librarySizeFactors lib.sf.zeisel <- librarySizeFactors(sce.zeisel) summary(lib.sf.zeisel) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.1757 0.5680 0.8680 1.0000 1.2783 4.0839 In the Zeisel brain data, the library size factors differ by up to 10-fold across cells. This is typical of the variability in coverage in scRNA-seq data. ## Plot the library size factor differences hist(log10(lib.sf.zeisel), xlab = "Log10[Size factor]", col = "grey80") Strictly speaking, the use of library size factors assumes that there is no “imbalance” in the differentially expressed (DE) genes between any pair of cells. Although, in practice, normalization accuracy is not a major consideration for exploratory scRNA-seq data analyses. Composition biases do not usually affect the separation of clusters, only the magnitude - and to a lesser extent, direction - of the log-fold changes between clusters or cell types. 11.5.2 Normalization by deconvolution Composition biases will be present when any unbalanced differential expression exists between samples. 
Consider the simple example of two cells where a single gene “X” is upregulated in one cell “A” compared to the other cell “B”. This upregulation means that either more sequencing resources are devoted to “X” in “A”, thus decreasing coverage of all other non-DE genes when the total library size of each cell is experimentally fixed; or the library size of “A” increases when “X” is assigned more reads or UMIs. The removal of composition biases is a well-studied problem for bulk RNA sequencing data analysis. - estimateSizeFactorsFromMatrix() function in the DESeq2 package (Anders and Huber 2010; Love, Huber, and Anders 2014) - calcNormFactors() function in the edgeR package (Robinson and Oshlack 2010). Single-cell data can be problematic for these bulk normalization methods due to the dominance of low and zero counts. To overcome this, we pool counts from many cells to increase the size of the counts for accurate size factor estimation (Lun, Bach, and Marioni 2016). Pool-based size factors are then “deconvolved” into cell-based factors for normalization of each cell’s expression profile. This is performed using the calculateSumFactors() function from scran. First we have a pre-clustering step with quickCluster() where cells in each cluster are normalized separately and the size factors are rescaled to be comparable across clusters. This avoids the assumption that most genes are non-DE across the entire population - only a non-DE majority is required between pairs of clusters, which is a weaker assumption for highly heterogeneous populations. library("scran") ## Compute quickCluster + calculateSumFactor for deconvolution normalization set.seed(100) clust.zeisel <- quickCluster(sce.zeisel) table(clust.zeisel) #> clust.zeisel #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 #> 170 254 441 178 393 148 219 240 189 123 112 103 135 111 deconv.sf.zeisel <- calculateSumFactors(sce.zeisel, clusters = clust.zeisel) summary(deconv.sf.zeisel) #> Min. 1st Qu. Median Mean 3rd Qu. Max. 
#> 0.1186 0.4860 0.8314 1.0000 1.3209 4.5090 11.5.3 Normalization by spike-ins Spike-in normalization is based on the assumption that the same amount of spike-in RNA was added to each cell, so, systematic differences in the coverage of the spike-in transcripts can only be due to cell-specific biases, e.g., in capture efficiency or sequencing depth. To remove these biases, we equalize spike-in coverage across cells by scaling with “spike-in size factors”. Compared to the previous methods, spike-in normalization requires no assumption about the biology of the system. Practically, spike-in normalization should be used if differences in the total RNA content of individual cells are of interest and must be preserved in downstream analyses. To demonstrate the use of spike-in normalization on a different dataset involving T cell activation after stimulation with T cell receptor ligands of varying affinity (Richard et al. 2018). library("scRNAseq") sce.richard <- RichardTCellData() #> loading from cache sce.richard <- sce.richard[, sce.richard$`single cell quality` == "OK"] sce.richard #> class: SingleCellExperiment #> dim: 46603 528 #> metadata(0): #> assays(1): counts #> rownames(46603): ENSMUSG00000102693 ENSMUSG00000064842 ... ENSMUSG00000096730 ENSMUSG00000095742 #> rowData names(0): #> colnames(528): SLX-12611.N701_S502. SLX-12611.N702_S502. ... SLX-12612.i712_i522. SLX-12612.i714_i522. #> colData names(13): age individual ... stimulus time #> reducedDimNames(0): #> mainExpName: endogenous #> altExpNames(1): ERCC We apply the computeSpikeFactors() method to estimate spike-in size factors for all cells. This is defined by converting the total spike-in count per cell into a size factor, using the same reasoning as in librarySizeFactors(). (Scaling will subsequently remove any differences in spike-in coverage across cells). 
## computeSpikeFactors() to estimate spike-in size factors sce.richard <- computeSpikeFactors(sce.richard, "ERCC") summary(sizeFactors(sce.richard)) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.1247 0.4282 0.6274 1.0000 1.0699 23.3161 11.5.4 Scaling and log-transforming Once we have computed the size factors, we use the logNormCounts() function from scater to compute normalized expression values for each cell. This is done by dividing the count for each gene/spike-in transcript with the appropriate size factor for that cell. The function also log-transforms the normalized values, creating a new assay called \"logcounts\". (Technically, these are “log-transformed normalized expression values”). ## Compute normalized expression values and log-transformation sce.zeisel <- logNormCounts(sce.zeisel) assayNames(sce.zeisel) #> [1] "counts" "logcounts" The log-transformation is useful as differences in the log-values represent log-fold changes in expression. By operating on log-transformed data, we ensure that these procedures are measuring distances between cells based on log-fold changes in expression. Log-transformation focuses on the former by promoting contributions from genes with strong relative differences. 11.6 Feature selection highly variable genes (HVGs) We often use scRNA-seq data in exploratory analyses to characterize heterogeneity across cells. Procedures like clustering and dimensionality reduction compare cells based on their gene expression profiles, which involves aggregating per-gene differences into a single (dis)similarity metric between a pair of cells. The choice of genes to use in this calculation has a major impact on the behavior of the metric and the performance of downstream methods. We want to select genes that contain useful information about the biology of the system while removing genes that contain random noise. 
This aims to preserve interesting biological structure without the variance that obscures that structure, and to reduce the size of the data to improve computational efficiency of later steps. The simplest approach to feature selection is to select the most variable genes based on their expression across the population. This assumes that genuine biological differences will manifest as increased variation in the affected genes, compared to other genes that are only affected by technical noise or a baseline level of “uninteresting” biological variation. 11.6.1 Quantifying per-gene variation The simplest approach to quantifying per-gene variation is to compute the variance of the log-normalized expression values (“log-counts”) for each gene across all cells (A. T. L. Lun, McCarthy, and Marioni 2016). The advantage of this approach is that the feature selection is based on the same log-values that are used for later downstream steps. In particular, genes with the largest variances in log-values will contribute most to the Euclidean distances between cells during procedures like clustering and dimensionality reduction. By using log-values here, we ensure that our quantitative definition of heterogeneity is consistent throughout the entire analysis. Calculation of the per-gene variance is simple, but feature selection requires modelling of the mean-variance relationship. The log-transformation is not a variance stabilizing transformation in most cases, which means that the total variance of a gene is driven more by its abundance than its underlying biological heterogeneity. To account for this effect, we use the modelGeneVar() function to fit a trend to the variance with respect to abundance across all genes (Figure 3.1). 
library("scran") ## Model the mean-variance relationship dec.zeisel <- modelGeneVar(sce.zeisel) ## Plot the fit fit.zeisel <- metadata(dec.zeisel) plot(fit.zeisel$mean, fit.zeisel$var, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curve(fit.zeisel$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) At any given abundance, we assume that the variation in expression for most genes is driven by uninteresting processes like sampling noise. Under this assumption, the fitted value of the trend at any given gene’s abundance represents an estimate of its uninteresting variation, which we call the technical component. We then define the biological component for each gene as the difference between its total variance and the technical component. This biological component represents the “interesting” variation for each gene and can be used as the metric for HVG selection. ## Order by most interesting genes for inspection dec.zeisel[order(dec.zeisel$bio, decreasing = TRUE), ] #> DataFrame with 19839 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Plp1 3.86637 15.44422 1.52686 13.91736 0.00000e+00 0.00000e+00 #> Trf 2.27790 9.95350 1.42611 8.52739 7.65644e-200 7.59060e-196 #> Mal 2.30761 9.19548 1.42963 7.76586 2.43043e-165 1.60635e-161 #> Apod 1.89630 7.78119 1.36339 6.41780 9.80407e-125 4.85988e-121 #> Mog 1.84701 7.30188 1.35204 5.94985 2.65362e-109 8.76934e-106 #> ... ... ... ... ... ... ... #> Ddx5 3.71905 0.762844 1.54756 -0.784711 0.994702 0.997822 #> [ reached getOption("max.print") -- omitted 4 rows ] 11.6.2 Quantifying technical noise (spike-ins) The assumptions made by quantifying per-gene variation may be problematic in rare scenarios where many genes at a particular abundance are affected by a biological process. For example, strong upregulation of cell type-specific genes may result in an enrichment of HVGs at high abundances. 
This would inflate the fitted trend in that abundance interval and compromise the detection of the relevant genes. We can avoid this problem by fitting a mean-dependent trend to the variance of the spike-in transcripts, if they are available. The premise here is that spike-ins should not be affected by biological variation, so the fitted value of the spike-in trend should represent a better estimate of the technical component for each gene. ## Fit a mean-dependent trend to the variance of the spike-in transcripts dec.spike.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC") ## Order by most interesting genes for inspection dec.spike.416b[order(dec.spike.416b$bio, decreasing = TRUE), ] #> DataFrame with 46604 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Lyz2 6.53871 13.5804 1.61389 11.9665 3.42011e-189 2.91162e-186 #> Top2a 5.78145 14.2104 2.56868 11.6418 4.26367e-72 1.20992e-69 #> Ccnb2 5.89731 13.6178 2.39162 11.2262 3.53970e-77 1.09579e-74 #> Ccl9 6.70039 12.4793 1.44789 11.0314 1.27204e-199 1.26341e-196 #> Hbb-bt 4.95051 15.0336 4.02525 11.0083 1.50804e-27 1.03594e-25 #> ... ... ... ... ... ... ... #> Rpl5-ps2 3.50523 0.831793 6.43592 -5.60413 0.999712 0.999857 #> [ reached getOption("max.print") -- omitted 4 rows ] ## Plot the fit plot(dec.spike.416b$mean, dec.spike.416b$total, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) fit.spike.416b <- metadata(dec.spike.416b) points(fit.spike.416b$mean, fit.spike.416b$var, col = "red", pch = 16) curve(fit.spike.416b$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) 11.6.3 Quantifying technical noise (mean-variance trend) In the absence of spike-in data, one can attempt to create a trend by making some distributional assumptions about the noise. For example, UMI counts typically exhibit near-Poisson variation if we only consider technical noise from library preparation and sequencing. 
This can be used to construct a mean-variance trend in the log-counts with the modelGeneVarByPoisson() function. ## construct a mean-variance trend in the log-counts set.seed(0010101) dec.pois.zeisel <- modelGeneVarByPoisson(sce.zeisel) ## Order by most interesting genes for inspection dec.pois.zeisel <- dec.pois.zeisel[order(dec.pois.zeisel$bio, decreasing = TRUE), ] head(dec.pois.zeisel) #> DataFrame with 6 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Plp1 3.86637 15.44422 0.226375 15.21785 0 0 #> Trf 2.27790 9.95350 0.635655 9.31785 0 0 #> Mal 2.30761 9.19548 0.626251 8.56923 0 0 #> Apod 1.89630 7.78119 0.748055 7.03313 0 0 #> Mog 1.84701 7.30188 0.760426 6.54146 0 0 #> Mbp 2.20638 6.74997 0.658112 6.09186 0 0 ## Plot the fit plot(dec.pois.zeisel$mean, dec.pois.zeisel$total, pch = 16, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curve(metadata(dec.pois.zeisel)$trend(x), col = "dodgerblue", add = TRUE) Trends based purely on technical noise tend to yield large biological components for highly-expressed genes. This often includes so-called “house-keeping” genes coding for essential cellular components such as ribosomal proteins, which are considered uninteresting for characterizing cellular heterogeneity. These observations suggest that a more accurate noise model does not necessarily yield a better ranking of HVGs. Though, one should keep an open mind that house-keeping genes are regularly DE in a variety of conditions 11.6.4 Handling batch effects Data containing multiple batches will often exhibit batch effects. We are usually not interested in HVGs that are driven by batch effects; instead, we want to focus on genes that are highly variable within each batch. This is naturally achieved by performing trend fitting and variance decomposition separately for each batch. 
We will try now this approach by treating each plate (block) in the 416B dataset as a different batch, using the modelGeneVarWithSpikes() function. (The same argument is available in all other variance-modelling functions.) ## Fit a mean-dependent trend to the variance of the spike-in transcripts ## Independently for each batch (block) dec.block.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", block = sce.416b$block) # block=sce.416b$block head(dec.block.416b[order(dec.block.416b$bio, decreasing = TRUE), 1:6]) #> DataFrame with 6 rows and 6 columns #> mean total tech bio p.value FDR #> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> #> Lyz2 6.53871 13.5779 1.63259 11.9453 0.00000e+00 0.00000e+00 #> Top2a 5.78145 13.9852 2.77254 11.2126 6.95319e-238 2.07179e-235 #> Ccl9 6.70039 12.5449 1.40416 11.1408 0.00000e+00 0.00000e+00 #> Hbb-bt 4.95051 15.0506 4.07362 10.9770 7.34341e-90 5.66488e-88 #> Ccnb2 5.89731 13.3673 2.60730 10.7600 1.02751e-282 3.88775e-280 #> Cd200r3 4.81056 14.9435 4.31950 10.6240 8.48013e-76 5.20981e-74 ## Plot the fit by batch (block) par(mfrow = c(1, 2)) blocked.stats <- dec.block.416b$per.block for (i in colnames(blocked.stats)) { current <- blocked.stats[[i]] plot(current$mean, current$total, main = i, pch = 16, cex = 0.5, xlab = "Mean of log-expression", ylab = "Variance of log-expression" ) curfit <- metadata(current) points(curfit$mean, curfit$var, col = "red", pch = 16) curve(curfit$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) } The use of a batch-specific trend fit is useful as it accommodates differences in the mean-variance trends between batches. This is especially important if batches exhibit systematic technical differences, e.g., differences in coverage or in the amount of spike-in RNA added. 11.6.5 Selecting highly variable genes Once we have quantified the per-gene variation, the next step is to select the subset of HVGs to use in downstream analyses. 
A larger subset will reduce the risk of discarding interesting biological signal by retaining more potentially relevant genes, at the cost of increasing noise from irrelevant genes that might obscure said signal. It is difficult to determine the optimal trade-off for any given application as noise in one context may be useful signal in another. The most obvious selection strategy is to take the top “n” genes with the largest values for the relevant variance metric. The main advantage of this approach is that the user can directly control the number of genes retained, which ensures that the computational complexity of downstream calculations is easily predicted. For modelGeneVar() and modelGeneVarWithSpikes(), we would select the genes with the largest biological components. This is conveniently done for us via getTopHVGs(), as shown with n = 1000. ## Top 1000 genes hvg.zeisel.var <- getTopHVGs(dec.zeisel, n = 1000) str(hvg.zeisel.var) #> chr [1:1000] "Plp1" "Trf" "Mal" "Apod" "Mog" "Mbp" "Car2" "Cnp" "Ugt8a" "Enpp2" "Meg3" "Mobp" "Ermn" "Ptgds" ... The choice of “n” also has a fairly straightforward biological interpretation. The main disadvantage of this approach is that it turns HVG selection into a competition between genes, whereby a subset of very highly variable genes can push other informative genes out of the top set. This can be problematic for analyses of highly heterogeneous populations if the loss of important markers prevents the resolution of certain subpopulations. 11.7 Dimensionality reduction Many scRNA-seq analysis procedures involve comparing cells based on their expression values across multiple genes. For example, clustering aims to identify cells with similar transcriptomic profiles by computing Euclidean distances across genes. In these applications, each individual gene represents a dimension of the data. As the name suggests, dimensionality reduction aims to reduce the number of separate dimensions in the data. 
This is possible because different genes are correlated if they are affected by the same biological process. Thus, we do not need to store separate information for individual genes, but can instead compress multiple features into a single dimension, e.g., an “eigengene” (Langfelder and Horvath 2007). This reduces computational work in downstream analyses like clustering, as calculations only need to be performed for a few dimensions rather than thousands of genes; reduces noise by averaging across multiple genes to obtain a more precise representation of the patterns in the data; and enables effective plotting of the data, for those of us who are not capable of visualizing more than 3 dimensions. 11.7.1 Principal components analysis Principal components analysis (PCA) discovers axes in high-dimensional space that capture the largest amount of variation. This is best understood by imagining each axis as a line. Say we draw a line anywhere, and we move each cell in our data set onto the closest position on the line. The variance captured by this axis is defined as the variance in the positions of cells along that line. In PCA, the first axis (or “principal component”, PC) is chosen such that it maximizes this variance. The next PC is chosen such that it is orthogonal to the first and captures the greatest remaining amount of variation, and so on. By definition, the top PCs capture the dominant factors of heterogeneity in the data set. In the context of scRNA-seq, our assumption is that biological processes affect multiple genes in a coordinated manner. This means that the earlier PCs are likely to represent biological structure as more variation can be captured by considering the correlated behavior of many genes. By comparison, random technical or biological noise is expected to affect each gene independently. There is unlikely to be an axis that can capture random variation across many genes, meaning that noise should mostly be concentrated in the later PCs. 
This motivates the use of the earlier PCs in our downstream analyses, which concentrates the biological signal to simultaneously reduce computational work and remove noise. We can perform PCA on the log-normalized expression values using the fixedPCA() function from scran. By default, fixedPCA() will compute the first 50 PCs and store them in the reducedDims() of the output SingleCellExperiment object, as shown below. Here, we use only the top 2000 genes with the largest biological components to reduce both computational work and high-dimensional random noise. In particular, while PCA is robust to random noise, an excess of it may cause the earlier PCs to capture noise instead of biological structure (Johnstone and Lu 2009). library("scran") ## Top 2000 HVGs top.zeisel <- getTopHVGs(dec.zeisel, n = 2000) ## Principal component analysis using top 2000 HVGs, 50 PCs set.seed(100) sce.zeisel <- fixedPCA(sce.zeisel, subset.row = top.zeisel) reducedDimNames(sce.zeisel) #> [1] "PCA" 11.7.2 Choosing the number of PCs How many of the top PCs should we retain for downstream analyses? The choice of the number of PCs is an analogous decision to the choice of the number of HVGs to use. Using more PCs will retain more biological signal at the cost of including more noise that might mask said signal. On the other hand, using fewer PCs will introduce competition between different factors of variation, where weaker (but still interesting) factors may be pushed down into lower PCs and inadvertently discarded from downstream analyses. It is hard to determine whether an “optimal” choice exists for the number of PCs. Certainly, we could attempt to remove the technical variation that is almost always uninteresting. However, even if we were only left with biological variation, there is no straightforward way to automatically determine which aspects of this variation are relevant. Most practitioners will simply set to a “reasonable” but arbitrary value, typically ranging from 10 to 50. 
This is satisfactory depending on the amount of variance explained by that certain number of PCs. ## Variance explained by PCs percent.var <- attr(reducedDim(sce.zeisel), "percentVar") plot(percent.var, log = "y", xlab = "PC", ylab = "Variance explained (%)") 11.7.3 Visualizing the PCs Algorithms are more than happy to operate on 10-50 PCs, but these are still too many dimensions for human comprehension. To visualize the data, the most common and easy way is to use the top 2 PCs for plotting. library("scater") ## Plot PCA (Top 2 PCs for 2-dimensional visualization) plotReducedDim(sce.zeisel, dimred = "PCA", colour_by = "level1class") The problem is that PCA is a linear technique, i.e., only variation along a line in high-dimensional space is captured by each PC. As such, it cannot efficiently pack differences in d dimensions into the first 2 PCs. One workaround is to plot several of the top PCs against each other in pairwise plots. However, it is difficult to interpret multiple plots simultaneously, and even this approach is not sufficient to separate some of the annotated subpopulations. ## Plot top 4 PCs against each other in pairwise plots plotReducedDim(sce.zeisel, dimred = "PCA", ncomponents = 4, colour_by = "level1class") 11.7.4 Non-linear methods for visualization 11.7.4.1 t-stochastic neighbor embedding The de facto standard for visualization of scRNA-seq data is the t-stochastic neighbor embedding (TSNE) method (Van der Maaten and Hinton 2008). This attempts to find a low-dimensional representation of the data that preserves the distances between each point and its neighbors in the high-dimensional space. Unlike PCA, it is not restricted to linear transformations, nor is it obliged to accurately represent distances between distant populations. 
This means that it has much more freedom in how it arranges cells in low-dimensional space, enabling it to separate many distinct clusters in a complex population. ## TSNE using runTSNE() stores the t-SNE coordinates in the reducedDims set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA") ## Plot TSNE plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class") The “perplexity” is another important parameter that determines the granularity of the visualization. Low perplexities will favor resolution of finer structure, possibly to the point that the visualization is compromised by random noise. Thus, it is advisable to test different perplexity values to ensure that the choice of perplexity does not drive the interpretation of the plot. ## run TSNE using different perplexity numbers and plot ## TSNE using perplexity = 5 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 5) out5 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 5") ## TSNE using perplexity = 20 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 20) out20 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 20") ## TSNE using perplexity = 80 set.seed(100) sce.zeisel <- runTSNE(sce.zeisel, dimred = "PCA", perplexity = 80) out80 <- plotReducedDim(sce.zeisel, dimred = "TSNE", colour_by = "level1class" ) + ggtitle("perplexity = 80") ## Combine plots gridExtra::grid.arrange(out5, out20, out80, ncol = 3) 11.7.4.2 Uniform manifold approximation and projection The uniform manifold approximation and projection (UMAP) method (McInnes, Healy, and Melville 2018) is an alternative to TSNE for non-linear dimensionality reduction. It is roughly similar to tSNE in that it also tries to find a low-dimensional representation that preserves relationships between neighbors in high-dimensional space. 
However, the two methods are based on different theory, represented by differences in the various graph weighting equations. This manifests as a different visualization. ## UMAP using runUMAP() stores the coordinates in the reducedDims set.seed(100) sce.zeisel <- runUMAP(sce.zeisel, dimred = "PCA") ## Plot UMAP plotReducedDim(sce.zeisel, dimred = "UMAP", colour_by = "level1class") Compared to tSNE, the UMAP visualization tends to have more compact visual clusters with more empty space between them. It also attempts to preserve more of the global structure than tSNE. From a practical perspective, UMAP is much faster than tSNE, which may be an important consideration for large datasets. UMAP also involves a series of randomization steps so setting the seed is critical. It is arguable whether the UMAP or tSNE visualizations are more useful or aesthetically pleasing. UMAP aims to preserve more global structure but this necessarily reduces resolution within each visual cluster. However, UMAP is unarguably much faster, and for that reason alone, it is increasingly displacing TSNE as the method of choice for visualizing large scRNA-seq data sets. 11.8 Clustering Clustering is an unsupervised learning procedure that is used to empirically define groups of cells with similar expression profiles. Its primary purpose is to summarize complex scRNA-seq data into a digestible format for human interpretation. This allows us to describe population heterogeneity in terms of discrete labels that are easily understood, rather than attempting to comprehend the high-dimensional manifold on which the cells truly reside. After annotation based on marker genes, the clusters can be treated as proxies for more abstract biological concepts such as cell types or cell states. At this point, it is helpful to realize that clustering, like a microscope, is simply a tool to explore the data. 
We can zoom in and out by changing the resolution of the clustering parameters, and we can experiment with different clustering algorithms to obtain alternative perspectives of the data. This iterative approach is entirely permissible given that data exploration constitutes the majority of the scRNA-seq data analysis workflow. As such, questions about the “correctness” of the clusters or the “true” number of clusters are usually meaningless. We can define as many clusters as we like, with whatever algorithm we like. Each clustering will represent its own partitioning of the high-dimensional expression space, and is as “real” as any other clustering. A more relevant question is “how well do the clusters approximate the cell types or states of interest?” Unfortunately, this is difficult to answer given the context-dependent interpretation of the underlying biology. Some analysts will be satisfied with resolution of the major cell types; other analysts may want resolution of subtypes; and others still may require resolution of different states (e.g., metabolic activity, stress) within those subtypes. Regardless of the exact method used, clustering is a critical step for extracting biological insights from scRNA-seq data. 11.8.1 Graph-based clustering Graph-based clustering is a flexible and scalable technique for clustering large scRNA-seq datasets. We first build a graph where each node is a cell that is connected to its nearest neighbors in the high-dimensional space. Edges are weighted based on the similarity between the cells involved, with higher weight given to cells that are more closely related. We then apply algorithms to identify “communities” of cells that are more connected to cells in the same community than they are to cells of different communities. Each community represents a cluster that we can use for downstream interpretation. The major advantage of graph-based clustering lies in its scalability. 
It only requires a k-nearest neighbor search that can be done in log-linear time on average, in contrast to hierarchical clustering methods with runtimes that are quadratic with respect to the number of cells. Graph construction avoids making strong assumptions about the shape of the clusters or the distribution of cells within each cluster, compared to other methods like k-means (that favor spherical clusters) or Gaussian mixture models (that require normality). The main drawback of graph-based methods is that, after graph construction, no information is retained about relationships beyond the neighboring cells. To demonstrate, we use the clusterCells() function in scran on the Zeisel dataset. All calculations are performed using the top PCs to take advantage of data compression and denoising. This function returns a vector containing cluster assignments for each cell in our SingleCellExperiment object. By default, clusterCells() uses the 10 nearest neighbors of each cell to construct a shared nearest neighbor graph. Two cells are connected by an edge if any of their nearest neighbors are shared, with the edge weight defined from the highest average rank of the shared neighbors (Xu and Su 2015). The Walktrap method from the igraph package is then used to identify communities. library("scran") ## Cluster using "scran::clusterCells" nn.clusters <- clusterCells(sce.zeisel, use.dimred = "PCA") ## Cluster assignments table(nn.clusters) #> nn.clusters #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 #> 561 136 78 159 123 65 112 349 368 105 95 200 92 44 67 58 37 28 33 28 48 30 We assign the cluster assignments back into our SingleCellExperiment object as a factor in the column metadata. 
This allows us to conveniently visualize the distribution of clusters in a tSNE plot: ## Save the cluster assignments colLabels(sce.zeisel) <- nn.clusters ## Plot TSNE coloured by cluster assignments plotReducedDim(sce.zeisel, "TSNE", colour_by = "label") If we want to explicitly specify all of these parameters, we would use the more verbose call below. This uses a SNNGraphParam object from the bluster package to instruct clusterCells() to detect communities from a shared nearest-neighbor graph with the specified parameters. The appeal of this interface is that it allows us to easily switch to a different clustering algorithm by simply changing the BLUSPARAM argument. library(bluster) ## Clustering using k=10 nn.clusters2 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = SNNGraphParam(k = 10, type = "rank", cluster.fun = "walktrap") ) table(nn.clusters2) #> nn.clusters2 #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 #> 561 136 78 159 123 65 112 349 368 105 95 200 92 44 67 58 37 28 33 28 48 30 We could also obtain the graph itself by specifying full=TRUE in the clusterCells() call. Doing so will return all intermediate structures that are used during clustering, including a graph object from the igraph package. ## Obtain the graph nn.clust.info <- clusterCells(sce.zeisel, use.dimred = "PCA", full = TRUE) head(nn.clust.info$objects$graph) #> 6 x 2816 sparse Matrix of class "dgCMatrix" #> #> [1,] . 8.5 9.5 9.5 9 8.5 8 5.5 8 6 4 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ...... #> #> .............................. #> ........suppressing 2762 columns and 4 rows in show(); maybe adjust options(max.print=, width=) #> .............................. #> #> [6,] 8.5 8 5.5 9 9 . 8.5 9 8 6 5 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ...... 
11.8.2 Adjusting the parameters A graph-based clustering method has several key parameters: How many neighbors are considered when constructing the graph. What scheme is used to weight the edges. Which community detection algorithm is used to define the clusters. K Neighbors One of the most important parameters is k, the number of nearest neighbors used to construct the graph. This controls the resolution of the clustering where higher k yields a more inter-connected graph and broader clusters. Users can exploit this by experimenting with different values of k to obtain a satisfactory resolution. ## More resolved clustering using a smaller k (k=5) clust.5 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(k = 5)) table(clust.5) #> clust.5 #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 #> 118 98 336 85 36 115 49 85 429 295 40 37 97 56 45 43 77 159 28 40 52 33 24 51 28 89 36 51 65 8 #> 31 32 33 34 35 36 37 #> 15 17 20 14 9 9 27 ## Less resolved clustering using a larger k (k=50) clust.50 <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(k = 50)) table(clust.50) #> clust.50 #> 1 2 3 4 5 #> 362 812 945 288 409 ## Plot TSNE coloured by cluster assignments again, now with clust.50 results colLabels(sce.zeisel) <- clust.50 plotReducedDim(sce.zeisel, "TSNE", colour_by = "label") Edge weighting scheme Further tweaking can be performed by changing the edge weighting scheme during graph construction. Setting type = "number" will weight edges based on the number of nearest neighbors that are shared between two cells. Similarly, type = "jaccard" will weight edges according to the Jaccard index of the two sets of neighbors. We can also disable weighting altogether by using a simple k-nearest neighbor graph, which is occasionally useful for downstream graph operations that do not support weights. 
## Cluster using the number of shared nearest neighbors (type="number") clust.num <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(type = "number") ) table(clust.num) #> clust.num #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 #> 128 161 129 457 128 116 78 309 397 205 60 96 70 62 35 13 46 51 30 31 52 28 15 58 34 27 ## Cluster using the Jaccard index (similarity between sample sets) clust.jaccard <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(type = "jaccard") ) table(clust.jaccard) #> clust.jaccard #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 #> 131 166 195 129 294 128 113 77 332 200 375 61 97 71 84 32 13 46 53 30 52 28 31 36 15 27 ## Cluster without specifying a graph type (default method-KNNGraphParam) clust.none <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = KNNGraphParam() ) table(clust.none) #> clust.none #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 #> 77 454 297 132 105 164 129 104 62 533 186 45 105 33 69 82 50 52 31 34 30 15 27 Community detection The community detection can be performed by using any of the algorithms provided by igraph. 
The Walktrap approach is a common one, but many others are available to choose from: clust.walktrap <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "walktrap") ) clust.louvain <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "louvain") ) clust.infomap <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "infomap") ) clust.fast <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "fast_greedy") ) clust.labprop <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "label_prop") ) clust.eigen <- clusterCells(sce.zeisel, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "leading_eigen") ) 11.8.3 Hierarchical clustering Hierarchical clustering is an old technique that arranges samples into a hierarchy based on their relative similarity to each other. Most implementations do so by joining the most similar samples into a new cluster, then joining similar clusters into larger clusters, and so on, until all samples belong to a single cluster. This process yields a dendrogram that defines clusters with progressively increasing granularity. Variants of hierarchical clustering methods primarily differ in how they choose to perform the agglomerations. For example, complete linkage aims to merge clusters with the smallest maximum distance between their elements, while Ward’s method aims to minimize the increase in within-cluster variance. In the context of scRNA-seq, the main advantage of hierarchical clustering lies in the production of the dendrogram. This is a rich summary that quantitatively captures the relationships between subpopulations at various resolutions. This can be helpful for interpretation. In practice, hierarchical clustering is too slow to be used for anything but the smallest scRNA-seq datasets. 
Most implementations require a cell-cell distance matrix that is prohibitively expensive to compute for a large number of cells. Greedy agglomeration is also likely to result in a quantitatively suboptimal partitioning (as defined by the agglomeration measure) at higher levels of the dendrogram when the number of cells and merge steps is high. We use a HclustParam object to instruct clusterCells() to perform hierarchical clustering on the top PCs. Specifically, it computes a cell-cell distance matrix using the top PCs and then applies Ward’s minimum variance method to obtain a dendrogram. For this case, we will use the sce.416b dataset. library("scran") ## Top 2000 HVGs top.416b <- getTopHVGs(sce.416b, n = 2000) ## Principal component analysis using top 2000 HVGs, 50 PCs set.seed(100) sce.416b <- fixedPCA(sce.416b, subset.row = top.416b) ## TSNE sce.416b <- runTSNE(sce.416b, dimred = "PCA") library("dendextend") #> #> --------------------- #> Welcome to dendextend version 1.17.1 #> Type citation('dendextend') for how to cite the package. #> #> Type browseVignettes(package = 'dendextend') for the package vignette. #> The github page is: https://github.com/talgalili/dendextend/ #> #> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues #> You may ask questions at stackoverflow, use the r and dendextend tags: #> https://stackoverflow.com/questions/tagged/dendextend #> #> To suppress this message use: suppressPackageStartupMessages(library(dendextend)) #> --------------------- #> #> Attaching package: 'dendextend' #> The following object is masked from 'package:stats': #> #> cutree ## Perform hierarchical clustering on the PCA-reduced data from sce.416b ## The BLUSPARAM argument specifies the clustering method (here "ward.D2"). ## The full=TRUE argument ensures that additional objects related to clustering are returned. 
hclust.416b <- clusterCells(sce.416b, use.dimred = "PCA", BLUSPARAM = HclustParam(method = "ward.D2"), full = TRUE ) ## Extract the hierarchical clustering tree from the clustering result tree.416b <- hclust.416b$objects$hclust ## Customize the dendrogram for better visualization tree.416b$labels <- seq_along(tree.416b$labels) ## Convert the hierarchical clustering tree to a dendrogram object dend <- as.dendrogram(tree.416b, hang = 0.1) combined.fac <- paste0( sce.416b$block, ".", sub(" .*", "", sce.416b$phenotype) ) labels_colors(dend) <- c( "20160113.wild" = "blue", "20160113.induced" = "red", "20160325.wild" = "dodgerblue", "20160325.induced" = "salmon" )[combined.fac][order.dendrogram(dend)] ## Plot the dendrogram plot(dend) To obtain explicit clusters, we “cut” the tree by removing internal branches such that every subtree represents a distinct cluster. This is most simply done by removing internal branches above a certain height of the tree, as performed by the cutree() function. A more sophisticated variant of this approach is implemented in the dynamicTreeCut package, which uses the shape of the branches to obtain a better partitioning for complex dendrograms. We enable this option by setting cut.dynamic = TRUE, with additional tweaking of the deepSplit parameter to control the resolution of the resulting clusters. library("dynamicTreeCut") ## Perform hierarchical clustering with dynamic tree cut on the PCA ## The BLUSPARAM argument specifies the clustering method (here "ward.D2"), ## and enables dynamic tree cut (cut.dynamic=TRUE) with specific parameters. 
hclust.dyn <- clusterCells(sce.416b, use.dimred = "PCA", BLUSPARAM = HclustParam( method = "ward.D2", cut.dynamic = TRUE, cut.params = list(minClusterSize = 10, deepSplit = 1) ) ) table(hclust.dyn) #> hclust.dyn #> 1 2 3 4 #> 82 70 27 13 ## Plot dendrogram labels_colors(dend) <- as.integer(hclust.dyn)[order.dendrogram(dend)] plot(dend) ## Obtain assignments and plot TSNE colLabels(sce.416b) <- factor(hclust.dyn) plotReducedDim(sce.416b, "TSNE", colour_by = "label") 11.8.4 Subclustering Another simple approach to improving resolution is to repeat the feature selection and clustering within a single cluster. This aims to select HVGs and PCs that are more relevant to internal structure, improving resolution by avoiding noise from unnecessary features. Subsetting also encourages clustering methods to separate cells according to more modest heterogeneity in the absence of distinct subpopulations. 11.9 Marker gene detection To interpret our clustering results, we need to identify the genes that drive separation between clusters. These marker genes allow us to assign biological meaning to each cluster based on their functional annotation. In the simplest case, we have a priori knowledge of the marker genes associated with particular cell types, allowing us to treat the clustering as a proxy for cell type identity. The same principle can be applied to discover more subtle differences between clusters (e.g., changes in activation or differentiation state) based on the behavior of genes in the affected pathways. The most straightforward approach to marker gene detection involves testing for differential expression between clusters. If a gene is strongly DE between clusters, it is likely to have driven the separation of cells in the clustering algorithm. Several methods are available to quantify the differences in expression profiles between clusters and obtain a single ranking of genes for each cluster. 
11.9.1 Scoring markers by pairwise comparisons Our general strategy is to compare each pair of clusters and compute scores quantifying the differences in the expression distributions between clusters. The scores for all pairwise comparisons involving a particular cluster are then consolidated into a single DataFrame for that cluster. The scoreMarkers() function from scran returns a list of DataFrames, where each DataFrame corresponds to a cluster and each row of the DataFrame corresponds to a gene. In the DataFrame for cluster “X”, the columns contain the “self.average”: the mean log-expression in “X” “other.average”: the grand mean across all other clusters self.detected: the proportion of cells with detected expression in “X” other.detected: the mean detected proportion across all other clusters a variety of effect size summaries generated from all pairwise comparisons involving “X” library("scran") ## Scoring markers by pairwise comparisons marker.info <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel)) marker.info #> List of length 5 #> names(5): 1 2 3 4 5 ## Statistics for cluster 1 colnames(marker.info[["1"]]) #> [1] "self.average" "other.average" "self.detected" "other.detected" #> [5] "mean.logFC.cohen" "min.logFC.cohen" "median.logFC.cohen" "max.logFC.cohen" #> [9] "rank.logFC.cohen" "mean.AUC" "min.AUC" "median.AUC" #> [13] "max.AUC" "rank.AUC" "mean.logFC.detected" "min.logFC.detected" #> [17] "median.logFC.detected" "max.logFC.detected" "rank.logFC.detected" For each cluster, we can then rank candidate markers based on one of these effect size summaries ## Subset to the first cluster chosen <- marker.info[["1"]] ## Rank candidate markers based on one of these effect size summaries ordered <- chosen[order(chosen$mean.AUC, decreasing=TRUE),] head(ordered[,1:4]) #> DataFrame with 6 rows and 4 columns #> self.average other.average self.detected other.detected #> <numeric> <numeric> <numeric> <numeric> #> Cst3 6.16324 2.721622 0.977901 0.895103 #> Sepp1 
3.88488 0.830472 0.900552 0.297665 #> B2m 3.09863 0.820891 0.820442 0.468411 #> Sparcl1 5.54095 2.882289 0.930939 0.845173 #> Zfp36l1 2.48297 0.261565 0.723757 0.151524 #> Gng5 2.40613 0.603069 0.812155 0.320553 library("scater") ## Plot the marker gene expression by label plotExpression(sce.zeisel, features=head(rownames(ordered)), x="label", colour_by="label") # Distribution of expression values across clusters for the top potential # marker genes (as determined by the mean AUC) for cluster 1 Here, we deliberately use pairwise comparisons rather than comparing each cluster to the average of all other cells. The latter approach is sensitive to the population composition, which introduces an element of unpredictability to the marker sets due to variation in cell type abundances. In the worst case, the presence of one sub-population containing a majority of the cells will drive the selection of top markers for every other cluster, pushing out useful genes that can distinguish between the smaller sub-populations. 11.9.2 Effect sizes for pairwise comparisons The AUC or Cohen’s d is usually the best choice for general purpose marker detection, as they are effective regardless of the magnitude of the expression values. The log-fold change in the detected proportion is specifically useful for identifying binary changes in expression. AUC In the context of marker detection, the area under the curve (AUC) quantifies our ability to distinguish between two distributions in a pairwise comparison. The AUC represents the probability that a randomly chosen observation from our cluster of interest is greater than a randomly chosen observation from the other cluster. 
A value of 1 corresponds to upregulation, where all values of our cluster of interest are greater than any value from the other cluster A value of 0.5 means that there is no net difference in the location of the distributions A value of 0 corresponds to downregulation The AUC is closely related to the U-statistic in the Wilcoxon ranked sum test (a.k.a., Mann-Whitney U-test). ## Subset the AUC from the candidate markers of cluster 1 info ## and rank (by AUC) auc.only <- chosen[,grepl("AUC", colnames(chosen))] auc.only[order(auc.only$mean.AUC,decreasing=TRUE),] #> DataFrame with 19839 rows and 5 columns #> mean.AUC min.AUC median.AUC max.AUC rank.AUC #> <numeric> <numeric> <numeric> <numeric> <integer> #> Cst3 0.895187 0.858550 0.900951 0.920298 1 #> Sepp1 0.873234 0.723951 0.918144 0.932695 1 #> B2m 0.830457 0.794594 0.826349 0.874535 3 #> Sparcl1 0.829280 0.799851 0.816875 0.883517 2 #> Zfp36l1 0.828530 0.810392 0.827840 0.848047 3 #> ... ... ... ... ... ... #> Hsp90aa1 0.1054120 0.0623657 0.0905663 0.1781496 19655 #> Scg5 0.1033219 0.0193178 0.0312032 0.3315632 19249 #> Snurf 0.0876293 0.0140424 0.0358926 0.2646899 19529 #> [ reached getOption("max.print") -- omitted 2 rows ] Cohen’s d Cohen’s d is a standardized log-fold change where the difference in the mean log-expression between groups is scaled by the average standard deviation across groups. In other words, it is the number of standard deviations that separate the means of the two groups. The interpretation is similar to the log-fold change: Positive values indicate that the gene is upregulated in our cluster of interest Negative values indicate downregulation Values close to zero indicate that there is little difference. Cohen’s d is roughly analogous to the t-statistic in various two-sample t-tests. 
## Subset the "logFC.cohen" from the candidate markers of cluster 1 info ## and rank (by Cohen’s d) cohen.only <- chosen[,grepl("logFC.cohen", colnames(chosen))] cohen.only[order(cohen.only$mean.logFC.cohen,decreasing=TRUE),] #> DataFrame with 19839 rows and 5 columns #> mean.logFC.cohen min.logFC.cohen median.logFC.cohen max.logFC.cohen rank.logFC.cohen #> <numeric> <numeric> <numeric> <numeric> <integer> #> Sepp1 2.00527 0.887739 2.29849 2.53635 1 #> Cst3 1.77874 1.501392 1.82252 1.96852 1 #> Gng5 1.66122 0.538738 1.94492 2.21630 2 #> Zfp36l1 1.63278 1.459939 1.63000 1.81116 3 #> Apoe 1.58840 1.309029 1.61529 1.81402 5 #> ... ... ... ... ... ... #> Rab3a -2.80064 -3.94223 -3.35880 -0.542752 18990 #> Mllt11 -2.83899 -4.53504 -3.18120 -0.458515 18737 #> Acot7 -2.97134 -3.41241 -2.85837 -2.756219 19766 #> [ reached getOption("max.print") -- omitted 2 rows ] log-fold change Finally, we also compute the log-fold change in the proportion of cells with detected expression between clusters. This ignores any information about the magnitude of expression, only considering whether any expression is detected at all. Again, positive values indicate that a greater proportion of cells express the gene in our cluster of interest compared to the other cluster. Note that a pseudo-count is added to avoid undefined log-fold changes when no cells express the gene in either group. 
## Subset the "logFC.detected" from the candidate markers of cluster 1 info ## and rank (by log-fold change) detect.only <- chosen[,grepl("logFC.detected", colnames(chosen))] detect.only[order(detect.only$mean.logFC.detected,decreasing=TRUE),] #> DataFrame with 19839 rows and 5 columns #> mean.logFC.detected min.logFC.detected median.logFC.detected max.logFC.detected rank.logFC.detected #> <numeric> <numeric> <numeric> <numeric> <integer> #> Hhex 3.88701 3.18636 3.73608 4.88952 2 #> Ly6f 3.80886 2.62872 3.98527 4.63618 3 #> 9030619P08Rik 3.74819 2.62200 3.76970 4.83137 1 #> Casp8 3.69939 2.49673 3.34698 5.60688 3 #> Kcne1l 3.60801 2.17477 3.75120 4.75489 1 #> ... ... ... ... ... ... #> Slc35f4 -4.98323 -5.85599 -5.40401 -3.26891 19695 #> 2900079G21Rik -5.03133 -5.99641 -5.44274 -3.24343 19688 #> D630023F18Rik -5.16180 -6.71692 -5.63179 -2.66670 19516 #> [ reached getOption("max.print") -- omitted 2 rows ] The AUC or Cohen’s d is usually the best choice for general purpose marker detection, as they are effective regardless of the magnitude of the expression values. The log-fold change in the detected proportion is specifically useful for identifying binary changes in expression. 11.9.3 Summaries of pairwise effects In a dataset with “N” clusters, each cluster is associated with “N” − 1 values for each type of effect size described. To simplify interpretation, we summarize the effects for each cluster into some key statistics such as the mean and median. Each summary statistic has a different interpretation when used for ranking: mean: The most obvious summary statistic is the mean. For cluster “X”, a large mean effect size (>0 for the log-fold changes, >0.5 for the AUCs) indicates that the gene is upregulated in “X” compared to the average of the other groups. median: a large value indicates that the gene is upregulated in “X” compared to most (>50%) other clusters. The median provides greater robustness to outliers than the mean, which may or may not be desirable. 
On one hand, the median avoids an inflated effect size if only a minority of comparisons have large effects; on the other hand, it will also overstate the effect size by ignoring a minority of comparisons that have opposing effects. minimum value: The minimum value (min.*) is the most stringent summary for identifying upregulated genes, as a large value indicates that the gene is upregulated in “X” compared to all other clusters. Conversely, if the minimum is small (<0 for the log-fold changes, <0.5 for the AUCs), we can conclude that the gene is downregulated in “X” compared to at least one other cluster. maximum value: The maximum value (max.*) is the least stringent summary for identifying upregulated genes, as a large value can be obtained if there is strong upregulation in “X” compared to any other cluster. Conversely, if the maximum is small, we can conclude that the gene is downregulated in “X” compared to all other clusters. minimum rank: The minimum rank, a.k.a., “min-rank” (rank.*) is the smallest rank of each gene across all pairwise comparisons. Specifically, genes are ranked within each pairwise comparison based on decreasing effect size, and then the smallest rank across all comparisons is reported for each gene. If a gene has a small min-rank, we can conclude that it is one of the top upregulated genes in at least one comparison of “X” to another cluster. Each of these summaries is computed for each effect size, for each gene, and for each cluster. Our next step is to choose one of these summary statistics for one of the effect sizes and to use it to rank the rows of the DataFrame. For identifying upregulated genes, ranking by the minimum is the most stringent and the maximum is the least stringent; the mean and median fall somewhere in between and are reasonable defaults for most applications. 
11.9.4 Choose top markers To continue our example, we will use the median Cohen’s d to obtain a ranking of the marker genes. Now that we have them ranked, we can choose how many of them are interesting to us. For this example, we will stay with only the set of markers in which Cohen’s d derived min-ranks is less than or equal to 10. ## Order the candidate markers by "rank.logFC.cohen" for each cluster ordered <- chosen[order(chosen$rank.logFC.cohen),] ## Choose the top marker genes for each cluster top.ranked <- ordered[ordered$rank.logFC.cohen <= 10,] rownames(top.ranked) # Gene names #> [1] "Cst3" "Sepp1" "Gng5" "Sparcl1" "B2m" "Zfp36l1" "Atp1a2" "Qk" "Apoe" "Id3" #> [11] "Sat1" "Sparc" "Cd63" "Epas1" "Slco1c1" "Glul" "Gstm1" "Mt1" "Serpine2" We can also plot the expression in a Heat Map: ## Plot a heatmap for the expression of some top marker genes for each cluster plotGroupedHeatmap(sce.zeisel, features=rownames(top.ranked), group="label", center=TRUE, zlim=c(-3, 3)) ### Using a log-fold change threshold The Cohen’s d and AUC calculations consider both the magnitude of the difference between clusters as well as the variability within each cluster. If the variability is lower, it is possible for a gene to have a large effect size even if the magnitude of the difference is small. These genes tend to be somewhat uninformative for cell type identification despite their strong differential expression (e.g., ribosomal protein genes). We would prefer genes with larger log-fold changes between clusters, even if they have higher variability. To favor the detection of such genes, we can compute the effect sizes relative to a log-fold change threshold by setting lfc= in scoreMarkers(). 
## Scoring markers by pairwise comparisons (effect sizes relative to a log-fold change) marker.info.lfc <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel), lfc=2) ## Statistics for cluster 1 chosen2 <- marker.info.lfc[["1"]] ## Rank info from cluster 1 by mean.AUC chosen2 <- chosen2[order(chosen2$mean.AUC, decreasing=TRUE),] chosen2[,c("self.average", "other.average", "mean.AUC")] # Check "self.average", "other.average", "mean.AUC" #> DataFrame with 19839 rows and 3 columns #> self.average other.average mean.AUC #> <numeric> <numeric> <numeric> #> Sepp1 3.88488 0.830472 0.716022 #> Cst3 6.16324 2.721622 0.706665 #> Atp1a2 4.54695 0.995355 0.680076 #> Apoe 4.44418 0.619766 0.667839 #> Sparc 3.71278 0.625208 0.642240 #> ... ... ... ... #> Zscan4b 0.00000000 0.000835435 0 #> Zscan4e 0.00000000 0.000977365 0 #> Zscan4f 0.00000000 0.001098376 0 #> Zswim5 0.01622302 0.059746058 0 #> Zyg11a 0.00451334 0.001709500 0 We can also create something a little bit different. Here we have a dot plot of the top potential marker genes (as determined by the mean AUC) for cluster 1. Each row corresponds to a marker gene and each column corresponds to a cluster. The size of each dot represents the proportion of cells with detected expression of the gene in the cluster. The color is proportional to the average expression across all cells in that cluster. ## Dot plot for the potential top markers for cluster 1 plotDots(sce.zeisel, rownames(chosen2)[1:10], group="label") 11.9.5 Handling blocking factors Some studies may contain factors of variation that are known and not interesting (e.g., batch effects, sex differences). If these are not modelled, they can interfere with marker gene detection, most obviously by inflating the variance within each cluster, but also by distorting the log-fold changes if the cluster composition varies across levels of the blocking factor. 
To avoid these issues, we specify the blocking factor via the block= argument. ## Scoring markers by pairwise comparisons using a block factor (tissue) m.out <- scoreMarkers(sce.zeisel, colLabels(sce.zeisel), block=sce.zeisel$tissue) For each gene, each pairwise comparison between clusters is performed separately in each level of the blocking factor - in this case, the tissue of origin. By comparing within each batch, we cancel out any batch effects so that they are not conflated with the biological differences between subpopulations. The effect sizes are then averaged across batches to obtain a single value per comparison, using a weighted mean that accounts for the number of cells involved in the comparison in each batch. A similar correction is applied to the mean log-expression and proportion of detected cells inside and outside each cluster. ## Subset the info for cluster 1 demo <- m.out[["1"]] ## Order by the median Cohen’s d (computed using block=tissue) ordered <- demo[order(demo$median.logFC.cohen, decreasing=TRUE),] ordered[,1:4] #> DataFrame with 19839 rows and 4 columns #> self.average other.average self.detected other.detected #> <numeric> <numeric> <numeric> <numeric> #> Sepp1 3.50217 0.528293 0.825712 0.209078 #> Gng5 2.37150 0.575725 0.800459 0.311615 #> Cst3 6.45770 2.954119 0.981201 0.904447 #> Apoe 4.66651 0.795313 0.704185 0.186789 #> Zfp36l1 2.49179 0.268529 0.735299 0.156745 #> ... ... ... ... ... #> Syp 0.238229 2.20762 0.148603 0.814644 #> Rab3a 0.475216 2.62944 0.219682 0.878967 #> Snap25 1.229693 4.40115 0.433556 0.882369 #> Stmn3 0.614814 3.75383 0.204414 0.840670 #> Ndrg4 0.664605 3.23414 0.240788 0.825395 We can also plot our top marker genes expression now coloured by the block factor we used, in this case “tissue”. 
## In case we don’t have them as factors for the coloring sce.zeisel$tissue <- as.factor(sce.zeisel$tissue) ## Plot the top marker genes expression by tissue plotExpression(sce.zeisel, features=rownames(ordered)[1:6], x="label", colour_by="tissue") The block= argument works for all effect sizes shown above and is robust to differences in the log-fold changes or variance between batches. However, it assumes that each pair of clusters is present in at least one batch. In scenarios where cells from two clusters never co-occur in the same batch, the associated pairwise comparison will be impossible and is ignored during calculation of summary statistics. 11.9.6 Deconvobuddies if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } BiocManager::install("DeconvoBuddies") findMarkers_1vAll Calculate 1 vs. All standard fold change for each gene x cell type, wrapper function for scran::findMarkers https://research.libd.org/DeconvoBuddies/reference/findMarkers_1vAll.html get_mean_ratio2 Calculate the mean ratio value and rank for each gene for each cell type in the sce object, to identify effective marker genes https://research.libd.org/DeconvoBuddies/reference/get_mean_ratio2.html 11.10 Cell type annotation The most challenging task in scRNA-seq data analysis is arguably the interpretation of the results. Obtaining clusters of cells is fairly straightforward, but it is more difficult to determine what biological state is represented by each of those clusters. Doing so requires us to bridge the gap between the current dataset and prior biological knowledge, and the latter is not always available in a consistent and quantitative manner. Even the concept of a “cell type” is not clearly defined :( Interpretation of scRNA-seq data is often manual and a common bottleneck in the analysis workflow. 
To expedite this step, we can try to use various computational approaches that exploit prior information to assign meaning to an uncharacterized scRNA-seq dataset. The most obvious sources of prior information are the curated gene sets associated with particular biological processes, e.g., from the Gene Ontology (GO) or the Kyoto Encyclopedia of Genes and Genomes (KEGG) collections. Alternatively, we could directly compare our expression profiles to published reference datasets where each sample or cell has already been annotated with its putative biological state by domain experts. It is important to have in mind that this step will largely depend on the amount of previous biological knowledge for your specific data set. The most obvious sources of prior information are the curated gene sets associated with particular biological processes, e.g., from the Gene Ontology (GO) or the Kyoto Encyclopedia of Genes and Genomes (KEGG) collections. Alternatively, we can directly compare our expression profiles to published reference datasets where each sample or cell has already been annotated with its putative biological state by domain experts. 11.10.1 Assigning cell labels from reference data A conceptually straightforward annotation approach is to compare the single-cell expression profiles with previously annotated reference datasets. Labels can then be assigned to each cell in our uncharacterized test dataset based on the most similar reference sample(s). This is a standard classification challenge that can be tackled by standard machine learning techniques such as random forests and support vector machines. Any published and labelled RNA-seq dataset (bulk or single-cell) can be used as a reference SingleR method The SingleR method (Aran et al. 
2019) for cell type annotation assigns labels to cells based on the reference samples with the highest Spearman rank correlations, using only the marker genes between pairs of labels to focus on the relevant differences between cell types. It also performs a fine-tuning step for each cell where the correlations are recomputed with just the marker genes for the top-scoring labels. This aims to resolve any ambiguity between those labels by removing noise from irrelevant markers for other labels. Further details can be found in the SingleR book. 11.11 Getting ready again We will now use one of the 10X PBMC datasets as our test. We will apply quality control, normalization and clustering for this dataset, although this is not strictly necessary. It is entirely possible to run SingleR() on the raw counts without any a priori quality control and filter on the annotation results at one’s leisure. ## Load data library("DropletTestFiles") raw.path <- getTestFile("tenx-2.1.0-pbmc4k/1.0.0/raw.tar.gz") #> see ?DropletTestFiles and browseVignettes('DropletTestFiles') for documentation #> downloading 1 resources #> retrieving 1 resource #> loading from cache out.path <- file.path(tempdir(), "pbmc4k") untar(raw.path, exdir=out.path) library("DropletUtils") fname <- file.path(out.path, "raw_gene_bc_matrices/GRCh38") sce.pbmc <- read10xCounts(fname, col.names=TRUE) library("scater") rownames(sce.pbmc) <- uniquifyFeatureNames( rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol) library("EnsDb.Hsapiens.v86") location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID, column="SEQNAME", keytype="GENEID") #> Warning: Unable to map 144 of 33694 requested IDs. 
### QC set.seed(100) e.out <- emptyDrops(counts(sce.pbmc)) sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)] unfiltered <- sce.pbmc stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT"))) high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher") sce.pbmc <- sce.pbmc[,!high.mito] summary(high.mito) #> Mode FALSE TRUE #> logical 3985 315 ### Normalization library("scran") set.seed(1000) clusters <- quickCluster(sce.pbmc) sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters) sce.pbmc <- logNormCounts(sce.pbmc) summary(sizeFactors(sce.pbmc)) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.00749 0.71207 0.87490 1.00000 1.09900 12.25412 ### Variance modelling set.seed(1001) dec.pbmc <- modelGeneVarByPoisson(sce.pbmc) top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1) ### Dimensionality reduction set.seed(10000) sce.pbmc <- denoisePCA(sce.pbmc, subset.row=top.pbmc, technical=dec.pbmc) set.seed(100000) sce.pbmc <- runTSNE(sce.pbmc, dimred="PCA") set.seed(1000000) sce.pbmc <- runUMAP(sce.pbmc, dimred="PCA") ### Clustering g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA') clust <- igraph::cluster_walktrap(g)$membership colLabels(sce.pbmc) <- factor(clust) table(colLabels(sce.pbmc)) #> #> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 #> 205 731 617 56 541 352 125 46 819 47 153 61 129 87 16 plotTSNE(sce.pbmc, colour_by="label") ### Interpretation markers <- findMarkers(sce.pbmc, pval.type="some", direction="up") marker.set <- markers[["8"]] as.data.frame(marker.set[1:30,1:3]) #> p.value FDR summary.logFC #> PF4 5.234138e-32 1.763591e-27 6.862880 #> TMSB4X 3.502960e-25 5.901437e-21 3.129070 #> TAGLN2 2.055571e-24 2.308680e-20 4.771441 #> NRGN 1.005824e-22 8.472562e-19 5.007984 #> SDPR 2.288275e-22 1.542023e-18 5.610425 #> PPBP 7.961199e-20 4.470744e-16 6.500820 #> GPX1 1.177137e-19 5.666066e-16 5.158546 #> CCL5 5.637712e-19 2.374463e-15 5.316314 #> GNG11 8.384893e-19 3.139118e-15 5.475652 #> HIST1H2AC 2.660666e-18 8.964847e-15 5.532573 #> TUBB1 7.919842e-18 
2.425920e-14 4.987507 #> ACTB 4.073237e-17 1.058163e-13 3.171552 #> B2M 4.082661e-17 1.058163e-13 1.610689 #> FTH1 2.973798e-14 7.157083e-11 3.425641 #> RGS18 6.579466e-13 1.477924e-09 4.298459 #> ACRBP 1.357416e-12 2.858549e-09 3.969306 #> [ reached 'max' / getOption("max.print") -- omitted 14 rows ] plotExpression(sce.pbmc, features=c("CD14", "CD68", "MNDA", "FCGR3A"), x="label", colour_by="label") 11.11.1 Using existing references The celldex contains a number of curated reference datasets, mostly assembled from bulk RNA-seq or microarray data of sorted cell types. These references are often good enough for most applications provided that they contain the cell types that are expected in the test population. Here, we will use a reference constructed from Blueprint and ENCODE data (Martens and Stunnenberg 2013; The ENCODE Project Consortium 2012); this is obtained by calling the BlueprintEncode() function to construct a SummarizedExperiment containing log-expression values with curated labels for each sample. library("celldex") #> #> Attaching package: 'celldex' #> The following objects are masked from 'package:scRNAseq': #> #> fetchLatestVersion, fetchMetadata, listVersions ref <- BlueprintEncodeData() ref #> class: SummarizedExperiment #> dim: 19859 259 #> metadata(0): #> assays(1): logcounts #> rownames(19859): TSPAN6 TNMD ... LINC00550 GIMAP1-GIMAP5 #> rowData names(0): #> colnames(259): mature.neutrophil CD14.positive..CD16.negative.classical.monocyte ... #> epithelial.cell.of.umbilical.artery.1 dermis.lymphatic.vessel.endothelial.cell.1 #> colData names(3): label.main label.fine label.ont We call the SingleR() function to annotate each of our PBMCs with the main cell type labels from the Blueprint/ENCODE reference. This returns a DataFrame where each row corresponds to a cell in the test dataset and contains its label assignments. 
Alternatively, we could use the labels in ref$label.fine, which provide more resolution at the cost of speed and increased ambiguity in the assignments. library("SingleR") #> #> Attaching package: 'SingleR' #> The following objects are masked from 'package:celldex': #> #> BlueprintEncodeData, DatabaseImmuneCellExpressionData, HumanPrimaryCellAtlasData, ImmGenData, #> MonacoImmuneData, MouseRNAseqData, NovershternHematopoieticData pred <- SingleR(test=sce.pbmc, ref=ref, labels=ref$label.main) table(pred$labels) #> #> B-cells CD4+ T-cells CD8+ T-cells DC Eosinophils Erythrocytes HSC Monocytes NK cells #> 549 772 1275 1 1 6 14 1116 251 Now, we can inspect the results using a heatmap of the per-cell and label scores. Ideally, each cell should exhibit a high score in one label relative to all of the others, indicating that the assignment to that label was unambiguous. In this particular case it is true for monocytes and B cells, whereas we see more ambiguity between CD4+ and CD8+ T cells (and to a lesser extent, NK cells). plotScoreHeatmap(pred) We now compare the assignments with the clustering results to determine the identity of each cluster using a Heatmap of the distribution of cells across labels and clusters in the 10X PBMC dataset. Here, several clusters are nested within the monocyte and B cell labels, indicating that the clustering represents finer subdivisions within the cell types. Interestingly, our clustering does not effectively distinguish between CD4+ and CD8+ T cell labels. This is probably due to the presence of other factors of heterogeneity within the T cell subpopulation (e.g., activation) that have a stronger influence on unsupervised methods than the a priori expected CD4+/CD8+ distinction. tab <- table(Assigned=pred$pruned.labels, Cluster=colLabels(sce.pbmc)) # Adding a pseudo-count of 10 to avoid strong color jumps with just 1 cell. 
library(pheatmap) pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101)) This highlights some of the differences between reference-based annotation and unsupervised clustering. The former explicitly focuses on aspects of the data that are known to be interesting, simplifying the process of biological interpretation. However, the cost is that the downstream analysis is restricted by the diversity and resolution of the available labels, a problem that is largely avoided by de novo identification of clusters. Applying both strategies to examine the agreement (or lack thereof) between reference label and cluster assignments could work for you. Any inconsistencies are not necessarily problematic due to the conceptual differences between the two approaches; indeed, one could use those discrepancies as the basis for further investigation to discover novel factors of variation in the data. We can also apply SingleR to single-cell reference datasets that are curated and supplied by the user. This is most obviously useful when we have an existing dataset that was previously (manually) annotated and we want to use that knowledge to annotate a new dataset in an automated manner. 11.11.2 Assigning cell labels from gene sets A related strategy is to explicitly identify sets of marker genes that are highly expressed in each individual cell. This does not require matching of individual cells to the expression values of the reference dataset, which is faster and more convenient when only the identities of the markers are available. For this example, we will be using the neuronal cell type markers derived from the Zeisel et al. (2015) study. 
library("scran") wilcox.z <- pairwiseWilcox(sce.zeisel, sce.zeisel$level1class, lfc=1, direction="up") markers.z <- getTopMarkers(wilcox.z$statistics, wilcox.z$pairs, pairwise=FALSE, n=50) lengths(markers.z) #> astrocytes_ependymal endothelial-mural interneurons microglia oligodendrocytes #> 78 85 120 69 82 #> pyramidal CA1 pyramidal SS #> 122 148 And our test dataset will be another brain scRNA-seq experiment from Tasic et al. (2016). library("scRNAseq") sce.tasic <- TasicBrainData() sce.tasic #> class: SingleCellExperiment #> dim: 24058 1809 #> metadata(0): #> assays(1): counts #> rownames(24058): 0610005C13Rik 0610007C21Rik ... mt_X57780 tdTomato #> rowData names(0): #> colnames(1809): Calb2_tdTpositive_cell_1 Calb2_tdTpositive_cell_2 ... Rbp4_CTX_250ng_2 Trib2_CTX_250ng_1 #> colData names(12): mouse_line cre_driver_1 ... secondary_type aibs_vignette_id #> reducedDimNames(0): #> mainExpName: NULL #> altExpNames(1): ERCC We are using the AUCell package to identify marker sets that are highly expressed in each cell. This method ranks genes by their expression values within each cell and constructs a response curve of the number of genes from each marker set that are present with increasing rank. It then computes the area under the curve (AUC) for each marker set, quantifying the enrichment of those markers among the most highly expressed genes in that cell. This is roughly similar to performing a Wilcoxon rank sum test between genes in and outside of the set, but involving only the top ranking genes by expression in each cell. 
library("GSEABase") #> Loading required package: annotate #> Loading required package: XML #> #> Attaching package: 'XML' #> The following object is masked from 'package:patchwork': #> #> free #> Loading required package: graph #> #> Attaching package: 'graph' #> The following object is masked from 'package:XML': #> #> addNode #> The following object is masked from 'package:circlize': #> #> degree #> The following object is masked from 'package:stringr': #> #> boundary library("AUCell") all.sets <- lapply(names(markers.z), function(x) { GeneSet(markers.z[[x]], setName=x) }) all.sets <- GeneSetCollection(all.sets) rankings <- AUCell_buildRankings(counts(sce.tasic), plotStats=FALSE, verbose=FALSE) cell.aucs <- AUCell_calcAUC(all.sets, rankings) #> Genes in the gene sets NOT available in the dataset: #> endothelial-mural: 8 (9% of 85) #> interneurons: 1 (1% of 120) #> oligodendrocytes: 2 (2% of 82) #> pyramidal CA1: 4 (3% of 122) #> pyramidal SS: 4 (3% of 148) results <- t(assay(cell.aucs)) head(results) #> gene sets #> cells astrocytes_ependymal endothelial-mural interneurons microglia oligodendrocytes #> Calb2_tdTpositive_cell_1 0.1285798 0.04210738 0.5480712 0.04845394 0.1291290 #> Calb2_tdTpositive_cell_2 0.1261887 0.04823270 0.4615967 0.02682648 0.1083978 #> Calb2_tdTpositive_cell_3 0.1030379 0.07177445 0.3679172 0.03582241 0.1345914 #> Calb2_tdTpositive_cell_4 0.1220786 0.04930379 0.5336098 0.05387632 0.1250108 #> Calb2_tdTpositive_cell_5 0.1531630 0.06033829 0.5062161 0.06655747 0.1151828 #> Calb2_tdTpositive_cell_6 0.1237204 0.09046280 0.3618004 0.03201310 0.1293656 #> gene sets #> cells pyramidal CA1 pyramidal SS #> Calb2_tdTpositive_cell_1 0.2306182 0.3381124 #> Calb2_tdTpositive_cell_2 0.2033824 0.2716657 #> Calb2_tdTpositive_cell_3 0.3217893 0.5137783 #> Calb2_tdTpositive_cell_4 0.2569572 0.3441631 #> Calb2_tdTpositive_cell_5 0.2109269 0.3030309 #> Calb2_tdTpositive_cell_6 0.4041339 0.5251548 We assign cell type identity to each cell in the test dataset by 
taking the marker set with the top AUC as the label for that cell. Our new labels mostly agree with the original annotation from Tasic et al. (2016), which is encouraging :) The only exception involves misassignment of oligodendrocyte precursors to astrocytes, which may be understandable given that they are derived from a common lineage. In the absence of prior annotation, a more general diagnostic check is to compare the assigned labels to cluster identities, under the expectation that most cells of a single cluster would have the same label (or, if multiple labels are present, they should at least represent closely related cell states). new.labels <- colnames(results)[max.col(results)] tab <- table(new.labels, sce.tasic$broad_type) tab #> #> new.labels Astrocyte Endothelial Cell GABA-ergic Neuron Glutamatergic Neuron Microglia Oligodendrocyte #> astrocytes_ependymal 43 2 0 0 0 0 #> endothelial-mural 0 27 0 0 0 0 #> interneurons 0 0 760 3 0 0 #> microglia 0 0 0 0 22 0 #> oligodendrocytes 0 0 1 0 0 38 #> pyramidal SS 0 0 0 809 0 0 #> #> new.labels Oligodendrocyte Precursor Cell Unclassified #> astrocytes_ependymal 21 4 #> endothelial-mural 0 2 #> interneurons 0 15 #> microglia 0 1 #> oligodendrocytes 1 0 #> pyramidal SS 0 60 As a diagnostic measure, we examine the distribution of AUCs across cells for each label. In heterogeneous populations, the distribution for each label should be bimodal with one high-scoring peak containing cells of that cell type and a low-scoring peak containing cells of other types. The gap between these two peaks can be used to derive a threshold for whether a label is “active” for a particular cell. (In this case, we simply take the single highest-scoring label per cell as the labels should be mutually exclusive.) In populations where a particular cell type is expected, lack of clear bimodality for the corresponding label may indicate that its gene set is not sufficiently informative. 
par(mfrow=c(3,3)) AUCell_exploreThresholds(cell.aucs, plotHist=TRUE, assign=TRUE) #> $astrocytes_ependymal #> $astrocytes_ependymal$aucThr #> $astrocytes_ependymal$aucThr$selected #> minimumDens #> 0.04144623 #> #> $astrocytes_ependymal$aucThr$thresholds #> threshold nCells #> Global_k1 0.20913180 93 #> L_k2 0.20910138 93 #> R_k3 0.57911351 43 #> minimumDens 0.04144623 1808 #> #> $astrocytes_ependymal$aucThr$comment #> [1] "" #> #> #> $astrocytes_ependymal$assignment #> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4" #> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8" #> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12" #> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16" #> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20" #> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24" #> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_28" #> [29] "Calb2_tdTpositive_cell_29" "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" #> [33] "Calb2_tdTpositive_cell_33" "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" #> [37] "Calb2_tdTpositive_cell_37" "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" #> [41] "Calb2_tdTpositive_cell_41" "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" #> [45] "Calb2_tdTpositive_cell_45" "Calb2_tdTpositive_cell_46" "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_48" #> [49] "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_50" #> [ reached getOption("max.print") 
-- omitted 1758 entries ] #> #> #> $`endothelial-mural` #> $`endothelial-mural`$aucThr #> $`endothelial-mural`$aucThr$selected #> R_k3 #> 0.2463287 #> #> $`endothelial-mural`$aucThr$thresholds #> threshold nCells #> Global_k1 0.1196348 125 #> L_k2 0.1653355 53 #> R_k3 0.2463287 28 #> #> $`endothelial-mural`$aucThr$comment #> [1] "Few cells have high AUC values (0.018% cells with AUC> 0.20). " #> #> #> $`endothelial-mural`$assignment #> [1] "Ctgf_tdTpositive_cell_1" "Ctgf_tdTpositive_cell_2" "Ctgf_tdTpositive_cell_3" "Ctgf_tdTpositive_cell_4" #> [5] "Ctgf_tdTpositive_cell_5" "Ctgf_tdTpositive_cell_6" "Ctgf_tdTpositive_cell_7" "Ctgf_tdTpositive_cell_8" #> [9] "Ctgf_tdTpositive_cell_10" "Cux2_tdTnegative_cell_10" "Ndnf_tdTpositive_cell_1" "Ndnf_tdTpositive_cell_3" #> [13] "Ndnf_tdTpositive_cell_4" "Ndnf_tdTpositive_cell_5" "Ndnf_tdTpositive_cell_6" "Ndnf_tdTpositive_cell_7" #> [17] "Ndnf_tdTpositive_cell_8" "Ndnf_tdTpositive_cell_9" "Ndnf_tdTpositive_cell_10" "Ndnf_tdTpositive_cell_11" #> [21] "Ndnf_tdTpositive_cell_12" "Ndnf_tdTpositive_cell_20" "Ndnf_tdTpositive_cell_21" "Nos1_tdTpositive_cell_1" #> [25] "Nos1_tdTpositive_cell_28" "Nos1_tdTpositive_cell_54" "Nos1_tdTpositive_cell_66" "Ntsr1_tdTnegative_cell_29" #> #> #> $interneurons #> $interneurons$aucThr #> $interneurons$aucThr$selected #> minimumDens #> 0.2102008 #> #> $interneurons$aucThr$thresholds #> threshold nCells #> Global_k1 0.4912184 482 #> L_k2 0.2253667 1644 #> R_k3 0.4116720 960 #> minimumDens 0.2102008 1646 #> #> $interneurons$aucThr$comment #> [1] "The right distribution is taller. 
" #> #> #> $interneurons$assignment #> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4" #> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8" #> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12" #> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16" #> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20" #> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24" #> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29" #> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33" #> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37" #> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41" #> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45" #> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52" #> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55" #> [ reached getOption("max.print") -- omitted 1596 entries ] #> #> #> $microglia #> $microglia$aucThr #> $microglia$aucThr$selected #> R_k3 #> 0.4649278 #> #> $microglia$aucThr$thresholds #> threshold nCells #> tenPercentOfMax 0.06006924 474 #> Global_k1 0.09814942 88 #> L_k2 0.11821099 58 #> R_k3 0.46492785 23 #> #> $microglia$aucThr$comment #> [1] "Few cells have high AUC values (0.013% cells with AUC> 0.20). 
" #> #> #> $microglia$assignment #> [1] "Cux2_tdTnegative_cell_5" "Cux2_tdTnegative_cell_6" "Cux2_tdTnegative_cell_12" #> [4] "Cux2_tdTnegative_cell_15" "Ntsr1_tdTnegative_cell_18" "Ntsr1_tdTnegative_cell_28" #> [7] "Rbp4_tdTnegative_cell_2" "Rbp4_tdTnegative_cell_3" "Rbp4_tdTnegative_cell_4" #> [10] "Rbp4_tdTnegative_cell_10" "Rbp4_tdTnegative_cell_14" "Rbp4_tdTnegative_cell_16" #> [13] "Rbp4_tdTnegative_cell_19" "Rbp4_tdTnegative_cell_21" "Rbp4_tdTnegative_cell_22" #> [16] "Rbp4_tdTnegative_cell_23" "Scnn1a-Tg3_tdTnegative_cell_2" "Scnn1a-Tg3_tdTnegative_cell_4" #> [19] "Scnn1a-Tg3_tdTnegative_cell_8" "Scnn1a-Tg3_tdTnegative_cell_12" "Scnn1a-Tg3_tdTnegative_cell_16" #> [22] "Scnn1a-Tg3_tdTnegative_cell_21" "Scnn1a-Tg3_tdTnegative_cell_24" #> #> #> $oligodendrocytes #> $oligodendrocytes$aucThr #> $oligodendrocytes$aucThr$selected #> R_k3 #> 0.5673453 #> #> $oligodendrocytes$aucThr$thresholds #> threshold nCells #> Global_k1 0.2062242 82 #> L_k2 0.2302351 65 #> R_k3 0.5673453 34 #> #> $oligodendrocytes$aucThr$comment #> [1] "Few cells have high AUC values (0.048% cells with AUC> 0.20). 
" #> #> #> $oligodendrocytes$assignment #> [1] "Gad2_tdTpositive_cell_31" "Gad2_tdTpositive_cell_44" "Gad2_tdTpositive_cell_77" #> [4] "Ntsr1_tdTnegative_cell_3" "Ntsr1_tdTnegative_cell_5" "Ntsr1_tdTnegative_cell_13" #> [7] "Ntsr1_tdTnegative_cell_16" "Ntsr1_tdTnegative_cell_19" "Ntsr1_tdTnegative_cell_22" #> [10] "Ntsr1_tdTnegative_cell_23" "Ntsr1_tdTnegative_cell_24" "Ntsr1_tdTnegative_cell_32" #> [13] "Ntsr1_tdTnegative_cell_41" "Ntsr1_tdTnegative_cell_44" "Pvalb_tdTpositive_cell_81" #> [16] "Rbp4_tdTnegative_cell_1" "Rbp4_tdTnegative_cell_7" "Rbp4_tdTnegative_cell_12" #> [19] "Rbp4_tdTnegative_cell_15" "Rbp4_tdTnegative_cell_18" "Scnn1a-Tg2_tdTnegative_cell_3" #> [22] "Scnn1a-Tg2_tdTnegative_cell_17" "Scnn1a-Tg3_tdTnegative_cell_1" "Scnn1a-Tg3_tdTnegative_cell_3" #> [25] "Scnn1a-Tg3_tdTnegative_cell_6" "Scnn1a-Tg3_tdTnegative_cell_10" "Scnn1a-Tg3_tdTnegative_cell_13" #> [28] "Scnn1a-Tg3_tdTnegative_cell_14" "Scnn1a-Tg3_tdTnegative_cell_15" "Scnn1a-Tg3_tdTnegative_cell_18" #> [31] "Scnn1a-Tg3_tdTnegative_cell_19" "Scnn1a-Tg3_tdTnegative_cell_22" "Scnn1a-Tg3_tdTnegative_cell_23" #> [34] "Sst_tdTpositive_cell_19" #> #> #> $`pyramidal CA1` #> $`pyramidal CA1`$aucThr #> $`pyramidal CA1`$aucThr$selected #> minimumDens #> 0.1269473 #> #> $`pyramidal CA1`$aucThr$thresholds #> threshold nCells #> Global_k1 0.3328189 542 #> L_k2 0.6311996 0 #> R_k3 0.1545174 1646 #> minimumDens 0.1269473 1649 #> #> $`pyramidal CA1`$aucThr$comment #> [1] "The global distribution overlaps the partial distributions. 
" #> #> #> $`pyramidal CA1`$assignment #> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4" #> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8" #> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12" #> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16" #> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20" #> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24" #> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29" #> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33" #> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37" #> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41" #> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45" #> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52" #> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55" #> [ reached getOption("max.print") -- omitted 1599 entries ] #> #> #> $`pyramidal SS` #> $`pyramidal SS`$aucThr #> $`pyramidal SS`$aucThr$selected #> minimumDens #> 0.1829744 #> #> $`pyramidal SS`$aucThr$thresholds #> threshold nCells #> Global_k1 0.4960449 630 #> L_k2 0.8889189 0 #> R_k3 0.4359221 835 #> minimumDens 0.1829744 1649 #> #> $`pyramidal SS`$aucThr$comment #> [1] "The global distribution overlaps the partial distributions. 
" #> #> #> $`pyramidal SS`$assignment #> [1] "Calb2_tdTpositive_cell_1" "Calb2_tdTpositive_cell_2" "Calb2_tdTpositive_cell_3" "Calb2_tdTpositive_cell_4" #> [5] "Calb2_tdTpositive_cell_5" "Calb2_tdTpositive_cell_6" "Calb2_tdTpositive_cell_7" "Calb2_tdTpositive_cell_8" #> [9] "Calb2_tdTpositive_cell_9" "Calb2_tdTpositive_cell_10" "Calb2_tdTpositive_cell_11" "Calb2_tdTpositive_cell_12" #> [13] "Calb2_tdTpositive_cell_13" "Calb2_tdTpositive_cell_14" "Calb2_tdTpositive_cell_15" "Calb2_tdTpositive_cell_16" #> [17] "Calb2_tdTpositive_cell_17" "Calb2_tdTpositive_cell_18" "Calb2_tdTpositive_cell_19" "Calb2_tdTpositive_cell_20" #> [21] "Calb2_tdTpositive_cell_21" "Calb2_tdTpositive_cell_22" "Calb2_tdTpositive_cell_23" "Calb2_tdTpositive_cell_24" #> [25] "Calb2_tdTpositive_cell_25" "Calb2_tdTpositive_cell_26" "Calb2_tdTpositive_cell_27" "Calb2_tdTpositive_cell_29" #> [29] "Calb2_tdTpositive_cell_30" "Calb2_tdTpositive_cell_31" "Calb2_tdTpositive_cell_32" "Calb2_tdTpositive_cell_33" #> [33] "Calb2_tdTpositive_cell_34" "Calb2_tdTpositive_cell_35" "Calb2_tdTpositive_cell_36" "Calb2_tdTpositive_cell_37" #> [37] "Calb2_tdTpositive_cell_38" "Calb2_tdTpositive_cell_39" "Calb2_tdTpositive_cell_40" "Calb2_tdTpositive_cell_41" #> [41] "Calb2_tdTpositive_cell_42" "Calb2_tdTpositive_cell_43" "Calb2_tdTpositive_cell_44" "Calb2_tdTpositive_cell_45" #> [45] "Calb2_tdTpositive_cell_47" "Calb2_tdTpositive_cell_49" "Calb2_tdTpositive_cell_51" "Calb2_tdTpositive_cell_52" #> [49] "Calb2_tdTpositive_cell_54" "Calb2_tdTpositive_cell_55" #> [ reached getOption("max.print") -- omitted 1599 entries ] Interpretation of the AUCell results is most straightforward when the marker sets are mutually exclusive, as shown above for the cell type markers. In other applications, one might consider computing AUCs for gene sets associated with signalling or metabolic pathways. 
It is likely that multiple pathways will be active in any given cell, and it is tempting to use the AUCs to quantify this activity for comparison across cells. However, such comparisons must be interpreted with much caution as the AUCs are competitive values: any increase in one pathway’s activity will naturally reduce the AUCs for all other pathways, potentially resulting in spurious differences across the population. the advantage of the AUCell approach is that it does not require reference expression values. This is particularly useful when dealing with gene sets derived from the literature or other qualitative forms of biological knowledge. 11.12 References Amezquita, R. A., Lun, A. T., Becht, E., Carey, V. J., Carpp, L. N., Geistlinger, L., … & Hicks, S. C. (2020). Orchestrating single-cell analysis with Bioconductor. Nature methods, 17(2), 137-145. Aran, D., A. P. Looney, L. Liu, E. Wu, V. Fong, A. Hsu, S. Chak, et al. 2019. “Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage.” Nat. Immunol. 20 (2): 163–72. Bach, K., S. Pensa, M. Grzelak, J. Hadfield, D. J. Adams, J. C. Marioni, and W. T. Khaled. 2017. “Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing.” Nat Commun 8 (1): 2128. Martens, J. H., and H. G. Stunnenberg. 2013. “BLUEPRINT: mapping human blood cell epigenomes.” Haematologica 98 (10): 1487–9. Muraro, M. J., G. Dharmadhikari, D. Grun, N. Groen, T. Dielen, E. Jansen, L. van Gurp, et al. 2016. “A Single-Cell Transcriptome Atlas of the Human Pancreas.” Cell Syst 3 (4): 385–94. Segerstolpe, A., A. Palasantza, P. Eliasson, E. M. Andersson, A. C. Andreasson, X. Sun, S. Picelli, et al. 2016. “Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes.” Cell Metab. 24 (4): 593–607. Tasic, B., V. Menon, T. N. Nguyen, T. K. Kim, T. Jarsky, Z. Yao, B. Levi, et al. 2016. 
“Adult mouse cortical cell taxonomy revealed by single cell transcriptomics.” Nat. Neurosci. 19 (2): 335–46. The ENCODE Project Consortium. 2012. “An integrated encyclopedia of DNA elements in the human genome.” Nature 489 (7414): 57–74. Zeisel, A., A. B. Munoz-Manchado, S. Codeluppi, P. Lonnerberg, G. La Manno, A. Jureus, S. Marques, et al. 2015. “Brain structure. Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq.” Science 347 (6226): 1138–42. "],["introduction-to-spatial-transcriptomics.html", "12 Introduction to spatial transcriptomics 12.1 3’ Visium spatial technology 12.2 Spatial data visualization Bibliography", " 12 Introduction to spatial transcriptomics Instructor: Daianna Gonzalez-Padilla You might also be interested in this recent blog post by Leo https://lcolladotor.github.io/2024/05/23/humanpilot-first-spatially-resolved-transcriptomics-study-using-visium/ and the companion walk through video For a journal club presentation on the HumanPilot paper, check this video: In recent years, with constant improvements in the current sequencing technologies and the generation of more sophisticated omics methodologies and bioinformatic pipelines, we have been constantly demonstrating that specific cell types and cell-to-cell interactions play critical roles in the definition of numerous diseases and development-related processes. In fact, cell type-specific associations have been established for a number of diseases and disorders. Thus, understanding the cellular context and the spatial location in which normal and deregulated cellular events occur is necessary to unveil the molecular underpinnings of disease pathologies and malfunctions of the organisms. Spatial transcriptomics technologies are molecular profiling methods developed to measure gene expression levels in a tissue sample at the spatial resolution. 
These methods have been improved and expanded over time and are widely applied to study a wide range of biological processes and have provided numerous insights into disease and development mechanisms. In particular, the 10x Genomics Visium platform is a technology that spatially profiles the transcriptome of frozen and fixed tissue sections in combination with histology. 12.1 3’ Visium spatial technology This is the Visium technology more frequently used and it captures polyadenilated transcripts within individual spatially barcoded spots. In the Visium expression slide there are 4 capture areas, each of 6.5 (+1.5) mm\\(^2\\) with ~5k barcoded spots (55 µm in diameter each), within which mRNAs are captured by polyT primers that contain a read for sequencing (see below), a UMI (unique molecular identifier), and a spatial barcode. In this way, all RNAs trapped in the same spot are tagged with the same spot-specific barcode and we can computationally trace the original location of the transcripts. Figure 1: Schematic representation of the Visium capture areas and spots. Source: SciLifeLab (2023). 12.2 Spatial data visualization In order to interactively visualize example spatial data we’ll use the shiny web application of spatialLIBD: http://spatial.libd.org/spatialLIBD/. This web application allows to browse the human dorsolateral pre-frontal cortex (DLPFC) spatial transcriptomics data generated at the LIBD using the 10x Genomics Visium platform. In total there are 12 DLPFC tissue sections from 3 donors, each spanning six classical histological layers plus the white matter (WM). Figure 2: Human DLPFC tissue section. Spot plot depicting the 6 classical histological layers (L1-L6) and the white matter (WM) in a human DLPFC sample. 
12.2.1 Spot-level data exploration With this tool you can: Observe per-spot QC metrics and gene expression levels Explore spot clusters in the tissue sections Visualize the spot data on reduced dimensions Manually annotate spots to layers and export your manual annotations Customize the spatial images p.exercise { background-color: #FFFAFA; padding: 15px; border: 2px solid black; margin-left: 0px; border-radius: 1px; font-family: sans-serif; } 📝 Exercise 1: visualize the clustering of spots in all tissue sections using the different discrete variables to plot. Which one recapitulates better the six histological layers (plus the white matter) of the human DLPFC? 📝 Exercise 2: explore the expression of SNAP25 (neuronal marker gene), MOBP (oligodendrocyte/WM marker gene), and PCP4 (layer 5 marker gene) in each DLPFC tissue section. What do you observe? Are there any spatial patterns in the expression of these genes? 12.2.2 Layer-level data exploration Layer-level data result from pseudo-bulking the spot-level data, i.e. from aggregating spot data from all spots assigned to a given layer. At this level the tool allows to: Visualize the gene expression data at the layer level in reduced dimensions Plot the layer-level lognorm or raw expression of a gene across all tissue sections and extract DEGs among layers (ANOVA model), in a specific layer compared to the rest (enrichment model) or compared to another layer (pairwise model) Assess the enrichment of your own sets of genes of interest among the DEGs from these spatial DLPFC data Correlate gene-wise statistics for DE between sn/scRNA-seq data clusters/cell populations with the DE statistics in the human DLPFC layers provided in this study. This can be used to label your sn/scRNA-seq groups or clusters with the more molecularly-defined histological layers 📝 Exercise 3: plot the expression of SNAP25, MOBP, and PCP4 in the different layers of each DLPFC tissue section. 
Are there any significant differences in the expression of these genes between layers under any of the statistical models for DGE? p.link{ background-color: #FFFFFF; padding: 10px; border: 0px solid black; margin-left: 0px; border-radius: 1px; font-size: 13px; font-family: sans-serif; } 👉🏼 There is also the spatialLIBD R/Bioconductor package you can use to interactively inspect your own spatial data in a shiny web app. Bibliography SciLifeLab (2023). 10X Genomics Visium for Fresh Frozen samples. Web site: https://ngisweden.scilifelab.se/methods/10x-visium/ 10x Genomics (n.d.). Whole transcriptome discovery in the tissue context. Web site: https://www.10xgenomics.com/platforms/visium "],["re-use-of-bulk-rna-seq-methods-for-spatial-data-exercise.html", "13 Re-use of bulk RNA-seq methods for spatial data exercise 13.1 Spatial registration 13.2 Exercise", " 13 Re-use of bulk RNA-seq methods for spatial data exercise Instructor: Leo New in @sciencemagazine: our work from @LieberInstitute #spatialDLPFC applies #snRNAseq and #Visium spatial transcriptomic in the DLPFC to better understand anatomical structure and cellular populations in the human brain #PsychENCODE https://t.co/DKZqmG4YDi https://t.co/Tjp2OjTo63 pic.twitter.com/vQbjts2JtQ — Louise Huuki-Myers (@lahuuki) May 23, 2024 13.1 Spatial registration In 2023, Louise A. Huuki-Myers contributed a new vignette to spatialLIBD as noted on the package news / changelog: http://research.libd.org/spatialLIBD/news/index.html#spatiallibd-1132. You should be able to run without any issues the code Louise explained at http://research.libd.org/spatialLIBD/articles/guide_to_spatial_registration.html. This same information is displayed at https://bioconductor.org/packages/release/data/experiment/vignettes/spatialLIBD/inst/doc/guide_to_spatial_registration.html. 
## get reference layer enrichment statistics layer_modeling_results <- spatialLIBD::fetch_data(type = "modeling_results") #> adding rname 'https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1' #> 2024-06-11 11:00:48.91264 loading file /github/home/.cache/R/BiocFileCache/47672ee9738_Human_DLPFC_Visium_modeling_results.Rdata%3Fdl%3D1 If the above doesn’t work, related to the curl issue we previously discussed, then use this workaround: tmp_modeling_results <- tempfile("modeling_results.RData") download.file( "https://www.dropbox.com/s/se6rrgb9yhm5gfh/Human_DLPFC_Visium_modeling_results.Rdata?dl=1", tmp_modeling_results, mode = "wb" ) load(tmp_modeling_results, verbose = TRUE) #> Loading objects: #> modeling_results ## Let's rename the object into the name used in the ## spatial registration vignette (from spatialLIBD) layer_modeling_results <- modeling_results This journal club style video of the main results of the spatialDLPFC paper does explain the basics of spatial registration: For more on spatialDLPFC, check this second video about the supplementary results: 13.2 Exercise p.exercise { background-color: #E4EDE2; padding: 9px; border: 1px solid black; border-radius: 10px; font-family: sans-serif; } Exercise: Follow the vignette on spatial registration. Do the results change when you use cutoff_merge_ratio = 0.1? What is this argument controlling? "],["making-your-own-website-with-postcards.html", "14 Making your own website with postcards 14.1 here 14.2 Usethis 14.3 Git + GitHub 14.4 R websites 14.5 postcards 14.6 Create your own website with postcards! 14.7 References", " 14 Making your own website with postcards Instructor: Melissa Mayén Quiroz Welcome to “Making your own website with postcards”! Here we will explore essential tools and techniques to help you create and publish your own website using R and the postcards package. Content: here usethis Git + GitHub R websites postcards Create your own website with postcards! 
14.1 here The here package is a powerful tool for managing file paths in your R projects. It helps you construct paths to files relative to your project’s root, ensuring your code is more robust and easier to share with others. Using here helps avoid issues with hard-coded paths and enhances the reproducibility of your analyses. The base directory it takes will be the one you are in when you load the here package, heuristically finding the root of the project and positioning itself there. In this case, the package is already installed so we just need to load it. ## Install the package manually # install.packages("here") ## Load "here" (previously installed) library("here") Sometimes there might be an error, as it might clash with other packages (like plyr). To avoid this, we can use here::here() (which basically clarifies that the requested function is from the here package). here::here() #> [1] "/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024" Some useful commands are getwd() and setwd(), which deal with the working directory, which is the default location where R looks for files to read or save. getwd() retrieves the current working directory. setwd() allows changing the current working directory. getwd() # returns the current path setwd("desired/directory") # changes to the specified path Best Practice: Instead of using “setwd” to manually set your working directory, it is often better to use the “here” package. Using “here” avoids issues with hard-coded paths and ensures your scripts work regardless of the specific setup of your working environment. 
## Instead of "C:/Users/user/Desktop/data/myfile.csv" ## Use here to construct file paths file_path <- here("Users", "user", "Desktop", "data", "myfile.csv") # file_path <- here:here("Users", "user", "Desktop","data", "myfile.csv") data <- read.csv(file_path) Other examples of how “here” could be used: ## Example: save data to a file and load it a <- 1 c <- 23 save(a, c, file = here("test-data.RData")) # save(a, c, file = here:here("test-data.RData")) load(here("test-data.RData")) # load(here:here("test-data.RData")) ## Create a directory dir.create(here("subdirectory"), showWarnings = FALSE) # dir.create(here:here("subdirectory"), showWarnings = FALSE) ## Create a file, indicating the subdirectory (the first argument in this case) file.create(here("subdirectory", "filename")) #> [1] TRUE # file.create(here:here("subdirectory", "filename")) ## Open the new created file file.show(here("subdirectory", "filename")) # file.show(here:here("subdirectory", "filename")) ## For example, if we want to see our files in the directory list.files(here(), recursive = TRUE) #> [1] "_main_files/figure-html/assigned_vs_ann_heatmap-1.png" #> [2] "_main_files/figure-html/auc_explore_plots-1.png" #> [3] "_main_files/figure-html/CCA-1.png" #> [4] "_main_files/figure-html/cut_dendogram-1.png" #> [5] "_main_files/figure-html/cut_dendogram-2.png" #> [6] "_main_files/figure-html/EMM_example1-1.png" #> [7] "_main_files/figure-html/heat map-1.png" #> [8] "_main_files/figure-html/hist_libSizeFactors-1.png" #> [9] "_main_files/figure-html/hist_p-1.png" #> [10] "_main_files/figure-html/lessRes_clustering-1.png" #> [11] "_main_files/figure-html/modelGeneVar_batch-1.png" #> [12] "_main_files/figure-html/modelGeneVar_zeisel-1.png" #> [13] "_main_files/figure-html/modelGeneVarByPoisson_zeisel-1.png" #> [14] "_main_files/figure-html/modelGeneVarWithSpikes_416b-1.png" #> [15] "_main_files/figure-html/PCs_zeisel-1.png" #> [16] "_main_files/figure-html/plot_clusters_zeisel-1.png" #> [17] 
"_main_files/figure-html/plot_dendogram-1.png" #> [18] "_main_files/figure-html/plot_markergenes1-1.png" #> [19] "_main_files/figure-html/plot_markers_byblock-1.png" #> [20] "_main_files/figure-html/Plot_multiplePCA_PCs-1.png" #> [21] "_main_files/figure-html/plotDots_markers-1.png" #> [22] "_main_files/figure-html/predicted_vs_clusters_heatmap-1.png" #> [23] "_main_files/figure-html/QC_sce416b_plots-1.png" #> [24] "_main_files/figure-html/runTSNE_zeisel-1.png" #> [25] "_main_files/figure-html/set_PBMC_dataset-1.png" #> [26] "_main_files/figure-html/set_PBMC_dataset-2.png" #> [27] "_main_files/figure-html/top_markers_heatmap-1.png" #> [28] "_main_files/figure-html/TSNE_perplexity_plots-1.png" #> [29] "_main_files/figure-html/Umap_zeisel-1.png" #> [30] "_main_files/figure-html/unnamed-chunk-14-1.png" #> [31] "_main_files/figure-html/unnamed-chunk-15-1.png" #> [32] "_main_files/figure-html/unnamed-chunk-16-1.png" #> [33] "_main_files/figure-html/unnamed-chunk-17-1.png" #> [34] "_main_files/figure-html/unnamed-chunk-18-1.png" #> [35] "_main_files/figure-html/unnamed-chunk-19-1.png" #> [36] "_main_files/figure-html/VarExplained_PCs-1.png" #> [37] "_main_files/figure-html/volcano plot-1.png" #> [38] "_main_files/figure-html/voom-1.png" #> [39] "_main.Rmd" #> [40] "01_SummarizedExperiment.R" #> [41] "01_SummarizedExperiment.Rmd" #> [42] "02_iSEE.R" #> [43] "02_iSEE.Rmd" #> [44] "03_recount3_intro.R" #> [45] "03_recount3_intro.Rmd" #> [46] "04_DGE_analysis_overview.R" #> [47] "04_DGE_analysis_overview.Rmd" #> [48] "05_DGE_with_limma_voom.R" #> [49] "05_DGE_with_limma_voom.Rmd" #> [50] "06_ExploreModelMatrix.R" #> [ reached getOption("max.print") -- omitted 68 entries ] # list.files(here:here(), recursive = TRUE) 14.2 Usethis The usethis package simplifies many common setup tasks and workflows in R. It helps streamline the process of creating new projects, setting up Git repositories, and connecting with GitHub. 
Mastering usethis allows you to focus more on coding and less on configuration. In this case, the package is already installed so we just need to load it. ## Install the package manually # install.packages("usethis") ## Load "usethis (previously installed) library("usethis") Usage: All use_*() functions operate on the current directory. ## usethis::use_*() usethis::use_r() usethis::use_git() usethis::use_readme_md() ✔ indicates that usethis has setup everything for you. ● indicates that you’ll need to do some work yourself. ## For example, create a README file usethis::use_readme_md() #> ✔ Setting active project to '/__w/cshl_rstats_genome_scale_2024/cshl_rstats_genome_scale_2024' #> ✔ Writing 'README.md' More functions in usethis: usethis RDocumentation In the following exercises, we will see some uses of usethis. 14.3 Git + GitHub GitHub An Intro to Git and GitHub for Beginners (Tutorial) by HubSpot Version control is a critical skill. Git helps you track changes in your projects, collaborate with others, and maintain a history of your work. GitHub, a platform for hosting Git repositories, enables seamless collaboration and sharing of your projects with the world. Understanding Git and GitHub ensures your projects are well-organized and accessible. 14.3.1 Prerequisites We need a GitHub account. If you don’t have one, now is the time to create it! Create a GitHub account We also need to install Git on our computers as the gitcreds package requires it. Installing Git After installing Git, restart RStudio to allow it to annex. In this case, the packages are already installed so we just need to load them. # install.packages(c("gitcreds", "gert", "gh")) ## Load them separately library("gitcreds") library("gert") library("gh") 14.3.2 Creating a personal access token (PAT) To connect our RStudio repository with GitHub, we request a token, which allows GitHub to grant permission to our computer. You can request the token using R (choose a meaningful name). 
## Initiate connection with GitHub usethis::create_github_token() # redirects to GitHub where you'll choose a specific name for the token Copy the token to enter it later with gitcreds_set() gitcreds::gitcreds_set() # here you place the token (NOT your GitHub password!!!) Another way to request the token is by going to GitHub Tokens, this option will provide a recommendation of the parameters to select. The token expiration parameter can be changed so it does not expire (for security, GitHub does not recommend this). Otherwise, consider its validity period. Once generated, you must save the token, as it will not appear again. You can always generate a new one (don’t forget to delete the previous token). The next step is to configure our GitHub user in the global .gitconfig file: ## Configure GitHub user usethis::edit_git_config() # opens the global .gitconfig file ## Place the name and email of your GitHub account. ## JUST remove the "#" and respect the other spaces # [user] # name = N A M E # email = github_email 14.3.3 Initialize Git and GitHub repository Now let’s initialize the repository in Git (locally on your computer) and then request to connect it with GitHub servers. Git is the software while GitHub is the web platform (based on Git) that allows collaboration. ## Initialize the Git repository usethis::use_git() ## Connect your local Git repository with GitHub servers usethis::use_github() ** Done ** Useful command to check configuration: gh::gh_whoami() 14.3.4 Some other gert commands Once we have linked our repository with GitHub, we can continue updating it. 
Some useful commands for this are: git_add() git_commit() git_log() git_push() ## Write a new file, using here::here to specify the path writeLines("hello", here::here("R", "test-here.R")) ## Another way is to use use_r usethis::use_r("test-file-github.R") # adds file to the project's R directory ## For example, we might try adding something new gert::git_add("R/test-file-github.R") ## Add commit of what was done gert::git_commit("uploaded test file") ## Gives info about the commits gert::git_log() ## Upload your changes from the local repo to GitHub gert::git_push() # IMPORTANT COMMAND It might be more user-friendly to use the Git pane that appears in RStudio :) 14.4 R websites Creating websites using R opens up new ways to share your analyses, reports, and research. Whether you are building static sites with R Markdown or dynamic applications with Shiny, R provides powerful tools to make your content interactive and engaging. Learning to create and deploy R websites enhances your ability to communicate your work effectively. 14.4.1 1. Set Up _site.yml Creating a website with R Markdown involves several key steps. First, you set up a _site.yml file, which configures the site’s name, navigation bar, and global options like themes and additional CSS or JavaScript files. This file ensures a consistent look and feel across all pages. YAML (.yml file) name: "My Website" output_dir: "docs" navbar: title: "My Website" left: - text: "Home" href: index.html - text: "About" href: about.html output: html_document: theme: cosmo highlight: tango 14.4.2 2. Create index.Rmd for the Homepage The homepage is created using an index.Rmd file, which acts as the main entry point for visitors, providing an introduction or overview of the site. Additional pages, such as about.Rmd, offer more detailed information about the website or its author. 
Markdown (index.Rmd file) --- title: "Welcome to My Website" author: "Your Name" date: "2024-06-11" output: html_document --- # Welcome to My Website This is a website created with R Markdown. Here you can share your analyses, reports, and research. ## Example Section Here is an example of a simple analysis: ## To insert a code block follow the sintaxis removing "#" !!! #` ``{r} summary(cars) # ``` 14.4.3 3. Render the Site To render the site, use the rmarkdown::render_site() function, which converts all R Markdown and Markdown files into HTML. The resulting HTML files and resources are placed in a directory, typically _site. RStudio facilitates this process with tools like the “Knit” button for individual pages and the “Build” pane for the entire site. Common elements, such as shared HTML files and CSS for styling, ensure consistency and avoid redundancy. A well-configured navigation bar enhances user experience by providing easy access to different sections. rmarkdown::render_site() 14.4.4 4. Publish the Website Publishing involves copying the contents of the _site directory to a web server, making your site accessible to others. For example, if you’re creating a personal blog, you would set up the _site.yml file with your site’s title and navigation links. The index.Rmd file would introduce your blog, while about.Rmd would provide information about you. After writing your blog posts in R Markdown files and rendering the site, you would upload the _site directory to your web server. 14.4.4.1 Choose a Hosting Platform: Consider platforms like GitHub Pages or Netlify for easy and free hosting. 14.4.4.2 Upload Files: For GitHub Pages, push your files to a GitHub repository named username.github.io. For Netlify, connect your GitHub repository and configure the deployment settings. 14.4.4.3 Configure Hosting: On GitHub Pages, enable GitHub Pages in the repository settings. 
On Netlify, configure the deployment settings to specify the build command (rmarkdown::render_site()) and output directory (docs if using _site.yml). Continuous Deployment (Netlify). If hosting on a different server, manually upload the files to your server using FTP or a similar method. 14.5 postcards The postcards package makes it easy to create beautiful, single-page websites with minimal effort. It’s perfect for personal websites, portfolios, and project showcases. Using postcards allows you to present your work professionally and creatively, without needing extensive web development knowledge. A collection of R Markdown templates for creating simple and easy-to-personalize single-page websites. “The goal of the package is to make it easy for anyone to create a one-page personal website using an R Markdown document.” Author: Sean Kross [aut, cre] Maintainer: Sean Kross <sean at seankross.com> https://CRAN.R-project.org/package=postcards GitHub: https://github.com/seankross/postcards Similar to https://pages.github.com/ Your webpage should say something about you, your interests, and your projects, as well as how to contact you. Some examples: https://amy-peterson.github.io/ via https://github.com/amy-peterson/amy-peterson.github.com http://jtleek.com/ via https://github.com/jtleek/jtleek.github.io http://aejaffe.com/ via https://github.com/andrewejaffe/andrewejaffe.github.io https://hadley.nz/ via https://github.com/hadley/hadley.github.com https://emarquezz.github.io/ via https://github.com/emarquezz/emarquezz.github.io https://bpardo99.github.io/ via https://github.com/bpardo99/bpardo99.github.io https://daianna21.github.io/ via https://github.com/daianna21/daianna21.github.io. 14.5.1 Installation In this case, the package is already installed. 
## You can install Postcards with the following command: # install.packages("postcards") ## Or you can install the latest development version (not recommended): # remotes::install_github("seankross/postcards@main") 14.5.2 Templates Postcards include five templates: Jolla, Jolla Blue, Trestles, Onofre, and Solana. Each site is optimized for viewing on both desktop and mobile devices. The goal of the package is to make it easy for anyone to create a one-page personal website using an R Markdown document. Jolla: Jolla Blue: Trestles: Onofre: Solana: To start personalizing one of these templates, you need to create a new project. 14.6 Create your own website with postcards! Create your own website: Following the next steps you will be able to create your own personal website. You will need to have a GitHub account and connect Git. In case you missed it, you can go back to the “Git + GitHub” section. 14.6.1 Create a New Project in RStudio (Interactive Selection) If you use RStudio: Select “File”, “New Project”… Choose “New Directory”, “Postcards Website” Enter a directory name for your project in RStudio (“Your_Username.github.io”) Choose one of the templates from a dropdown menu Select “Create Project” after choosing a name for the folder that will contain your site. 
This folder will contain two important files: An R Markdown document with your site’s content A sample photo you should replace (with your own) ## Create a new project usethis::create_project("Your_Username.github.io") 14.6.2 Set Up Git and GitHub To save changes, you need to set up Git and GitHub ## Set up Git and GitHub usethis::use_git() # Restart the session usethis::use_github() 14.6.3 Choose a Template ## Choose only one template (the one you like the most) postcards::create_postcard(template = "jolla") postcards::create_postcard(template = "jolla-blue") postcards::create_postcard(template = "trestles") postcards::create_postcard(template = "onofre") postcards::create_postcard(template = "solana") In this way, you will also get the 2 important files: An R Markdown document with your site’s content A sample photo you should replace 14.6.4 Edit with Your Information Now you should edit the R Markdown document with your information and replace the image with one of your choice :) Fill in your information using the Markdown format. For example, https://github.com/andrewejaffe/andrewejaffe.github.io/blob/master/index.Rmd#L17-L31. 
Add your profiles in the style of https://github.com/andrewejaffe/andrewejaffe.github.io/blob/master/index.Rmd#L7-L12 14.6.5 Deploy the Page To compile the self-contained HTML file for the site: In RStudio, you can use the “Knit” button or directly: ## Deploy the GitHub page rmarkdown::render("index.Rmd") ** Done ** 14.7 References https://comunidadbioinfo.github.io/cdsb2021_scRNAseq/ejercicio-usando-usethis-here-y-postcards.html#vinculando-rstudio-con-git-y-github https://here.r-lib.org/ https://usethis.r-lib.org/ https://rmarkdown.rstudio.com/lesson-13.html https://bookdown.org/yihui/rmarkdown/rmarkdown-site.html https://product.hubspot.com/blog/git-and-github-tutorial-for-beginners https://github.com/Melii99/rnaseq_2024_postcards/blob/master/Actividad_postcards.Rmd https://lcolladotor.github.io/jhustatcomputing2023/projects/project-0/ "],["final-r-session.html", "Final R Session", " Final R Session This is the final R session after all the code in this book is run sequentially. #> ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────── #> setting value #> version R version 4.4.0 (2024-04-24) #> os Ubuntu 22.04.4 LTS #> system x86_64, linux-gnu #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz UTC #> date 2024-06-11 #> pandoc 3.1.13 @ /usr/bin/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> abind 1.4-5 2016-07-21 [1] RSPM (R 4.4.0) #> airway * 1.24.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.base 1.4.1 2024-05-03 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.matrix 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.ranges 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.sce 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> alabaster.schemas 1.4.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> 
alabaster.se 1.4.1 2024-05-21 [1] Bioconductor 3.19 (R 4.4.0) #> annotate * 1.82.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationDbi * 1.66.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationFilter * 1.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> AnnotationHub * 3.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> aod 1.3.3 2023-12-13 [1] RSPM (R 4.4.0) #> askpass 1.2.0 2023-09-03 [2] RSPM (R 4.4.0) #> attempt 0.3.1 2020-05-03 [1] RSPM (R 4.4.0) #> AUCell * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> backports 1.5.0 2024-05-23 [1] RSPM (R 4.4.0) #> base64enc 0.1-3 2015-07-28 [2] RSPM (R 4.4.0) #> beachmat 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> beeswarm 0.4.0 2021-06-01 [1] RSPM (R 4.4.0) #> benchmarkme 1.0.8 2022-06-12 [1] RSPM (R 4.4.0) #> benchmarkmeData 1.0.4 2020-04-23 [1] RSPM (R 4.4.0) #> bibtex 0.5.1 2023-01-26 [1] RSPM (R 4.4.0) #> Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocFileCache * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocIO 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocManager 1.30.23 2024-05-04 [2] CRAN (R 4.4.0) #> BiocNeighbors 1.22.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocParallel * 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocSingular 1.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocStyle * 2.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> biocthis * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> BiocVersion 3.19.1 2024-04-17 [2] Bioconductor 3.19 (R 4.4.0) #> Biostrings 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.0) #> bit 4.0.5 2022-11-15 [1] RSPM (R 4.4.0) #> bit64 4.0.5 2020-08-30 [1] RSPM (R 4.4.0) #> bitops 1.0-7 2021-04-24 [1] RSPM (R 4.4.0) #> blob 1.2.4 2023-03-17 [1] RSPM (R 4.4.0) #> bluster * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> bookdown 0.39 2024-04-15 [1] RSPM (R 4.4.0) #> boot 1.3-30 2024-02-26 [3] CRAN (R 4.4.0) 
#> brio 1.1.5 2024-04-24 [2] RSPM (R 4.4.0) #> broom 1.0.6 2024-05-17 [1] RSPM (R 4.4.0) #> bslib 0.7.0 2024-03-29 [2] RSPM (R 4.4.0) #> cachem 1.1.0 2024-05-16 [2] RSPM (R 4.4.0) #> Cairo 1.6-2 2023-11-28 [1] RSPM (R 4.4.0) #> caTools 1.18.2 2021-03-28 [1] RSPM (R 4.4.0) #> celldex * 1.14.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> checkmate 2.3.1 2023-12-04 [1] RSPM (R 4.4.0) #> circlize * 0.4.16 2024-02-20 [1] RSPM (R 4.4.0) #> cli 3.6.2 2023-12-11 [2] RSPM (R 4.4.0) #> clue 0.3-65 2023-09-23 [1] RSPM (R 4.4.0) #> cluster 2.1.6 2023-12-01 [3] CRAN (R 4.4.0) #> codetools 0.2-20 2024-03-31 [3] CRAN (R 4.4.0) #> colorspace 2.1-0 2023-01-23 [1] RSPM (R 4.4.0) #> colourpicker 1.3.0 2023-08-21 [1] RSPM (R 4.4.0) #> ComplexHeatmap * 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> config 0.3.2 2023-08-30 [1] RSPM (R 4.4.0) #> corpcor 1.6.10 2021-09-16 [1] RSPM (R 4.4.0) #> cowplot * 1.1.3 2024-01-22 [1] RSPM (R 4.4.0) #> crayon 1.5.2 2022-09-29 [2] RSPM (R 4.4.0) #> credentials 2.0.1 2023-09-06 [2] RSPM (R 4.4.0) #> curl 5.2.1 2024-03-01 [1] RSPM (R 4.4.0) #> data.table 1.15.4 2024-03-30 [1] RSPM (R 4.4.0) #> DBI 1.2.3 2024-06-02 [1] RSPM (R 4.4.0) #> dbplyr * 2.5.0 2024-03-19 [1] RSPM (R 4.4.0) #> DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.0) #> DelayedMatrixStats 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> dendextend * 1.17.1 2023-03-25 [1] RSPM (R 4.4.0) #> desc 1.4.3 2023-12-10 [2] RSPM (R 4.4.0) #> digest 0.6.35 2024-03-11 [2] RSPM (R 4.4.0) #> doParallel 1.0.17 2022-02-07 [1] RSPM (R 4.4.0) #> dotCall64 1.1-1 2023-11-28 [1] RSPM (R 4.4.0) #> dplyr 1.1.4 2023-11-17 [1] RSPM (R 4.4.0) #> dqrng 0.4.1 2024-05-28 [1] RSPM (R 4.4.0) #> DropletTestFiles * 1.14.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> DropletUtils * 1.24.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> DT 0.33 2024-04-04 [1] RSPM (R 4.4.0) #> dynamicTreeCut * 1.63-1 2016-03-11 [1] RSPM (R 4.4.0) #> edgeR * 4.2.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> 
EnsDb.Hsapiens.v86 * 2.99.0 2024-05-27 [1] Bioconductor #> ensembldb * 2.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> EnvStats 2.8.1 2023-08-22 [1] RSPM (R 4.4.0) #> evaluate 0.24.0 2024-06-10 [2] RSPM (R 4.4.0) #> ExperimentHub * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> ExploreModelMatrix * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> fANCOVA 0.6-1 2020-11-13 [1] RSPM (R 4.4.0) #> fansi 1.0.6 2023-12-08 [2] RSPM (R 4.4.0) #> farver 2.1.2 2024-05-13 [1] RSPM (R 4.4.0) #> fastmap 1.2.0 2024-05-15 [2] RSPM (R 4.4.0) #> fields 15.2 2023-08-17 [1] RSPM (R 4.4.0) #> filelock 1.0.3 2023-12-11 [1] RSPM (R 4.4.0) #> FNN 1.1.4 2024-01-12 [1] RSPM (R 4.4.0) #> foreach 1.5.2 2022-02-02 [1] RSPM (R 4.4.0) #> foreign 0.8-86 2023-11-28 [3] CRAN (R 4.4.0) #> Formula 1.2-5 2023-02-24 [1] RSPM (R 4.4.0) #> fs 1.6.4 2024-04-25 [2] RSPM (R 4.4.0) #> generics 0.1.3 2022-07-05 [1] RSPM (R 4.4.0) #> GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) #> GenomeInfoDbData 1.2.12 2024-05-26 [1] Bioconductor #> GenomicAlignments 1.40.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> GenomicFeatures * 1.56.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> GenomicRanges * 1.56.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> gert * 2.0.1 2023-12-04 [2] RSPM (R 4.4.0) #> GetoptLong 1.0.5 2020-12-15 [1] RSPM (R 4.4.0) #> ggbeeswarm 0.7.2 2023-04-29 [1] RSPM (R 4.4.0) #> ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0) #> ggrepel * 0.9.5 2024-01-10 [1] RSPM (R 4.4.0) #> gh * 1.4.1 2024-03-28 [2] RSPM (R 4.4.0) #> gitcreds * 0.1.2 2022-09-08 [2] RSPM (R 4.4.0) #> GlobalOptions 0.1.2 2020-06-10 [1] RSPM (R 4.4.0) #> glue 1.7.0 2024-01-09 [2] RSPM (R 4.4.0) #> golem 0.4.1 2023-06-05 [1] RSPM (R 4.4.0) #> gplots 3.1.3.1 2024-02-02 [1] RSPM (R 4.4.0) #> graph * 1.82.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> gridExtra 2.3 2017-09-09 [1] RSPM (R 4.4.0) #> GSEABase * 1.66.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0) #> 
gtools 3.9.5 2023-11-20 [1] RSPM (R 4.4.0) #> gypsum 1.0.1 2024-05-08 [1] Bioconductor 3.19 (R 4.4.0) #> HDF5Array 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> here * 1.0.1 2020-12-13 [1] RSPM (R 4.4.0) #> highr 0.11 2024-05-26 [2] RSPM (R 4.4.0) #> Hmisc * 5.1-3 2024-05-28 [1] RSPM (R 4.4.0) #> htmlTable 2.4.2 2023-10-29 [1] RSPM (R 4.4.0) #> htmltools 0.5.8.1 2024-04-04 [2] RSPM (R 4.4.0) #> htmlwidgets 1.6.4 2023-12-06 [2] RSPM (R 4.4.0) #> httpuv 1.6.15 2024-03-26 [2] RSPM (R 4.4.0) #> httr 1.4.7 2023-08-15 [2] RSPM (R 4.4.0) #> httr2 1.0.1 2024-04-01 [2] RSPM (R 4.4.0) #> igraph 2.0.3 2024-03-13 [1] RSPM (R 4.4.0) #> IRanges * 2.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> irlba 2.3.5.1 2022-10-03 [1] RSPM (R 4.4.0) #> iSEE * 2.16.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> iterators 1.0.14 2022-02-05 [1] RSPM (R 4.4.0) #> jquerylib 0.1.4 2021-04-26 [2] RSPM (R 4.4.0) #> jsonlite 1.8.8 2023-12-04 [2] RSPM (R 4.4.0) #> KEGGREST 1.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> kernlab 0.9-32 2023-01-31 [1] RSPM (R 4.4.0) #> KernSmooth 2.23-24 2024-05-17 [3] RSPM (R 4.4.0) #> knitr 1.47 2024-05-29 [2] RSPM (R 4.4.0) #> labeling 0.4.3 2023-08-29 [1] RSPM (R 4.4.0) #> later 1.3.2 2023-12-06 [2] RSPM (R 4.4.0) #> lattice 0.22-6 2024-03-20 [3] CRAN (R 4.4.0) #> lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.4.0) #> lifecycle 1.0.4 2023-11-07 [2] RSPM (R 4.4.0) #> limma * 3.60.2 2024-05-19 [1] Bioconductor 3.19 (R 4.4.0) #> listviewer 4.0.0 2023-09-30 [1] RSPM (R 4.4.0) #> lme4 1.1-35.3 2024-04-16 [1] RSPM (R 4.4.0) #> lmerTest 3.1-3 2020-10-23 [1] RSPM (R 4.4.0) #> lobstr * 1.1.2 2022-06-22 [1] RSPM (R 4.4.0) #> locfit 1.5-9.9 2024-03-01 [1] RSPM (R 4.4.0) #> lubridate 1.9.3 2023-09-27 [1] RSPM (R 4.4.0) #> magick 2.8.3 2024-02-18 [1] RSPM (R 4.4.0) #> magrittr 2.0.3 2022-03-30 [2] RSPM (R 4.4.0) #> maps 3.4.2 2023-12-15 [1] RSPM (R 4.4.0) #> MASS 7.3-60.2 2024-05-06 [3] local #> Matrix 1.7-0 2024-03-22 [3] CRAN (R 4.4.0) #> MatrixGenerics * 1.16.0 
2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> matrixStats * 1.3.0 2024-04-11 [1] RSPM (R 4.4.0) #> memoise 2.0.1 2021-11-26 [2] RSPM (R 4.4.0) #> metapod 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> mgcv 1.9-1 2023-12-21 [3] CRAN (R 4.4.0) #> mime 0.12 2021-09-28 [2] RSPM (R 4.4.0) #> miniUI 0.1.1.1 2018-05-18 [2] RSPM (R 4.4.0) #> minqa 1.2.7 2024-05-20 [1] RSPM (R 4.4.0) #> mixtools 2.0.0 2022-12-05 [1] RSPM (R 4.4.0) #> munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0) #> mvtnorm 1.2-5 2024-05-21 [1] RSPM (R 4.4.0) #> nlme 3.1-165 2024-06-06 [3] RSPM (R 4.4.0) #> nloptr 2.0.3 2022-05-26 [1] RSPM (R 4.4.0) #> nnet 7.3-19 2023-05-03 [3] CRAN (R 4.4.0) #> numDeriv 2016.8-1.1 2019-06-06 [1] RSPM (R 4.4.0) #> openssl 2.2.0 2024-05-16 [2] RSPM (R 4.4.0) #> paletteer 1.6.0 2024-01-21 [1] RSPM (R 4.4.0) #> patchwork * 1.2.0 2024-01-08 [1] RSPM (R 4.4.0) #> pbkrtest 0.5.2 2023-01-19 [1] RSPM (R 4.4.0) #> pheatmap * 1.0.12 2019-01-04 [1] RSPM (R 4.4.0) #> pillar 1.9.0 2023-03-22 [2] RSPM (R 4.4.0) #> pkgconfig 2.0.3 2019-09-22 [2] RSPM (R 4.4.0) #> pkgload 1.3.4 2024-01-16 [2] RSPM (R 4.4.0) #> plotly 4.10.4 2024-01-13 [1] RSPM (R 4.4.0) #> plyr 1.8.9 2023-10-02 [1] RSPM (R 4.4.0) #> png 0.1-8 2022-11-29 [1] RSPM (R 4.4.0) #> Polychrome * 1.5.1 2022-05-03 [1] RSPM (R 4.4.0) #> postcards * 0.2.3 2022-01-07 [1] RSPM (R 4.4.0) #> praise 1.0.0 2015-08-11 [2] RSPM (R 4.4.0) #> prettyunits 1.2.0 2023-09-24 [2] RSPM (R 4.4.0) #> promises 1.3.0 2024-04-05 [2] RSPM (R 4.4.0) #> ProtGenerics 1.36.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> purrr 1.0.2 2023-08-10 [2] RSPM (R 4.4.0) #> R.cache 0.16.0 2022-07-21 [1] RSPM (R 4.4.0) #> R.methodsS3 1.8.2 2022-06-13 [1] RSPM (R 4.4.0) #> R.oo 1.26.0 2024-01-24 [1] RSPM (R 4.4.0) #> R.utils 2.12.3 2023-11-18 [1] RSPM (R 4.4.0) #> R6 2.5.1 2021-08-19 [2] RSPM (R 4.4.0) #> rappdirs 0.3.3 2021-01-31 [2] RSPM (R 4.4.0) #> rbibutils 2.2.16 2023-10-25 [1] RSPM (R 4.4.0) #> RColorBrewer * 1.1-3 2022-04-03 [1] RSPM (R 4.4.0) #> Rcpp 1.0.12 
2024-01-09 [2] RSPM (R 4.4.0) #> RCurl 1.98-1.14 2024-01-09 [1] RSPM (R 4.4.0) #> Rdpack 2.6 2023-11-08 [1] RSPM (R 4.4.0) #> recount3 * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> RefManageR * 1.4.0 2022-09-30 [1] RSPM (R 4.4.0) #> remaCor 0.0.18 2024-02-08 [1] RSPM (R 4.4.0) #> rematch2 2.1.2 2020-05-01 [2] RSPM (R 4.4.0) #> reshape2 1.4.4 2020-04-09 [1] RSPM (R 4.4.0) #> restfulr 0.0.15 2022-06-16 [1] RSPM (R 4.4.0) #> rhdf5 2.48.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> rhdf5filters 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> Rhdf5lib 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> RhpcBLASctl 0.23-42 2023-02-11 [1] RSPM (R 4.4.0) #> rintrojs 0.3.4 2024-01-11 [1] RSPM (R 4.4.0) #> rjson 0.2.21 2022-01-09 [1] RSPM (R 4.4.0) #> rlang * 1.1.4 2024-06-04 [2] RSPM (R 4.4.0) #> rmarkdown 2.27 2024-05-17 [2] RSPM (R 4.4.0) #> rpart 4.1.23 2023-12-05 [3] CRAN (R 4.4.0) #> rprojroot 2.0.4 2023-11-05 [2] RSPM (R 4.4.0) #> Rsamtools 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> RSQLite 2.3.7 2024-05-27 [1] RSPM (R 4.4.0) #> rstudioapi 0.16.0 2024-03-24 [2] RSPM (R 4.4.0) #> rsvd 1.0.5 2021-04-16 [1] RSPM (R 4.4.0) #> rtracklayer 1.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> Rtsne 0.17 2023-12-07 [1] RSPM (R 4.4.0) #> S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.0) #> S4Vectors * 0.42.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> sass 0.4.9 2024-03-15 [2] RSPM (R 4.4.0) #> ScaledMatrix 1.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0) #> scater * 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scatterplot3d 0.3-44 2023-05-05 [1] RSPM (R 4.4.0) #> scran * 1.32.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> scRNAseq * 2.18.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.0) #> scuttle * 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> segmented 2.1-0 2024-05-14 [1] RSPM (R 4.4.0) #> sessioninfo * 1.2.2 2021-12-06 [2] RSPM (R 4.4.0) #> shape 1.4.6.1 2024-02-23 [1] RSPM (R 
4.4.0) #> shiny 1.8.1.1 2024-04-02 [2] RSPM (R 4.4.0) #> shinyAce 0.4.2 2022-05-06 [1] RSPM (R 4.4.0) #> shinydashboard 0.7.2 2021-09-30 [1] RSPM (R 4.4.0) #> shinyjs 2.1.0 2021-12-23 [1] RSPM (R 4.4.0) #> shinyWidgets 0.8.6 2024-04-24 [1] RSPM (R 4.4.0) #> SingleCellExperiment * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> SingleR * 2.6.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> smokingMouse * 0.99.91 2024-06-11 [1] Github (LieberInstitute/smokingMouse@96d8480) #> spam 2.10-0 2023-10-23 [1] RSPM (R 4.4.0) #> SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.0) #> sparseMatrixStats 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> SpatialExperiment * 1.14.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> spatialLIBD * 1.16.2 2024-05-28 [1] Bioconductor 3.19 (R 4.4.0) #> statmod 1.5.0 2023-01-06 [1] RSPM (R 4.4.0) #> stringi 1.8.4 2024-05-06 [2] RSPM (R 4.4.0) #> stringr * 1.5.1 2023-11-14 [2] RSPM (R 4.4.0) #> styler 1.10.3 2024-04-07 [1] RSPM (R 4.4.0) #> SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.0) #> survival 3.7-0 2024-06-05 [3] RSPM (R 4.4.0) #> sys 3.4.2 2023-05-23 [2] RSPM (R 4.4.0) #> testthat * 3.2.1.1 2024-04-14 [2] RSPM (R 4.4.0) #> tibble 3.2.1 2023-03-20 [2] RSPM (R 4.4.0) #> tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.4.0) #> tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0) #> timechange 0.3.0 2024-01-18 [1] RSPM (R 4.4.0) #> UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> usethis * 2.2.3 2024-02-19 [2] RSPM (R 4.4.0) #> utf8 1.2.4 2023-10-22 [2] RSPM (R 4.4.0) #> uwot 0.2.2 2024-04-21 [1] RSPM (R 4.4.0) #> variancePartition * 1.34.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> vctrs 0.6.5 2023-12-01 [2] RSPM (R 4.4.0) #> vipor 0.4.7 2023-12-18 [1] RSPM (R 4.4.0) #> viridis 0.6.5 2024-01-29 [1] RSPM (R 4.4.0) #> viridisLite 0.4.2 2023-05-02 [1] RSPM (R 4.4.0) #> whisker 0.4.1 2022-12-05 [2] RSPM (R 4.4.0) #> withr 3.0.0 2024-01-16 [2] RSPM (R 4.4.0) #> xfun 0.44 2024-05-15 [2] RSPM (R 4.4.0) #> 
XML * 3.99-0.16.1 2024-01-22 [1] RSPM (R 4.4.0) #> xml2 1.3.6 2023-12-04 [2] RSPM (R 4.4.0) #> xtable 1.8-4 2019-04-21 [2] RSPM (R 4.4.0) #> XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> yaml 2.3.8 2023-12-11 [2] RSPM (R 4.4.0) #> zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.0) #> #> [1] /__w/_temp/Library #> [2] /usr/local/lib/R/site-library #> [3] /usr/local/lib/R/library #> #> ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── System curl version information: curl::curl_version() #> $version #> [1] "7.81.0" #> #> $ssl_version #> [1] "OpenSSL/3.0.2" #> #> $libz_version #> [1] "1.2.11" #> #> $libssh_version #> [1] "libssh/0.9.6/openssl/zlib" #> #> $libidn_version #> [1] "2.3.2" #> #> $host #> [1] "x86_64-pc-linux-gnu" #> #> $protocols #> [1] "dict" "file" "ftp" "ftps" "gopher" "gophers" "http" "https" "imap" "imaps" "ldap" #> [12] "ldaps" "mqtt" "pop3" "pop3s" "rtmp" "rtsp" "scp" "sftp" "smb" "smbs" "smtp" #> [23] "smtps" "telnet" "tftp" #> #> $ipv6 #> [1] TRUE #> #> $http2 #> [1] TRUE #> #> $idn #> [1] TRUE This interactive book was last updated at 2024-06-11 11:00:51.293221. "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]]
diff --git a/summarizedexperiment-overview.html b/summarizedexperiment-overview.html
index 0b5e087..7735c67 100644
--- a/summarizedexperiment-overview.html
+++ b/summarizedexperiment-overview.html
@@ -314,8 +314,25 @@
11.8.3 Hierarchical clustering
11.8.4 Subclustering
-11.9 Marker gene detection
-11.10 Cell type annotation
+11.9 Marker gene detection
+
+11.10 Cell type annotation
+
+11.11 Getting ready again
+
+11.12 References
12 Introduction to spatial transcriptomics
@@ -363,7 +380,7 @@
- 14.6.4 Edit with Your Information
- 14.6.5 Deploy the Page
-14.7 References
+14.7 References
Final R Session