vignettes/signac_pbmc.Rmd

---
title: "Using scOpen in R"
author: "Zhijian Li"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output: html_document
---

## Introduction

### In this tutorial, we show how to use scOpen to perform dimension reduction in R. The data is from the Signac tutorial https://satijalab.org/signac/articles/pbmc_vignette.html. We also compare the results to LSI, which is the default dimension reduction method for scATAC-seq.

## Load package
```{r load_package, echo=FALSE}
# Packages used throughout this vignette.
library(Signac)
library(Seurat)
library(GenomeInfoDb)
# Ensembl annotation database passed to GetGRangesFromEnsDb() below.
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
# Fix the RNG seed so that the run is repeatable.
set.seed(1234)
```

## Dimension reduction using scOpen

### We first download the data using the following commands
```{bash, echo=FALSE}
# Download the four atac_v1_pbmc_10k input files (peak/barcode matrix,
# per-cell CSV, fragments file and its .tbi index) one after another.
for url in \
  https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5 \
  https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_singlecell.csv \
  https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz \
  https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz.tbi
do
  wget --no-verbose "$url"
done
```

### We then create a Seurat object for ATAC-seq data after quality control
```{r, fig.width=8, fig.height=4}
# Read the peak-by-barcode count matrix and the per-barcode metadata table
# (the first CSV column becomes the row names).
peak_matrix <- Read10X_h5(
  filename = "./atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5"
)
cell_metadata <- read.csv(
  file = "./atac_v1_pbmc_10k_singlecell.csv",
  header = TRUE,
  row.names = 1
)

# Wrap the counts in a chromatin assay linked to the fragments file.
peak_assay <- CreateChromatinAssay(
  counts = peak_matrix,
  sep = c(":", "-"),
  genome = "hg19",
  fragments = "./atac_v1_pbmc_10k_fragments.tsv.gz",
  min.cells = 10,
  min.features = 200
)

# Build the Seurat object with the chromatin assay stored as "peaks".
pbmc <- CreateSeuratObject(
  counts = peak_assay,
  assay = "peaks",
  meta.data = cell_metadata
)


# Pull gene annotations out of the EnsDb package.
gene_ranges <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)

# The data was mapped to hg19, so switch the annotation to UCSC-style
# sequence names and tag it with the matching genome.
seqlevelsStyle(gene_ranges) <- "UCSC"
genome(gene_ranges) <- "hg19"

# Attach the annotation to the Seurat object.
Annotation(pbmc) <- gene_ranges

# Per-cell nucleosome signal score.
pbmc <- NucleosomeSignal(object = pbmc)

# Per-cell TSS enrichment score.
pbmc <- TSSEnrichment(object = pbmc, fast = FALSE)

# Percentage of fragments in peaks, and blacklist fragments relative to
# peak fragments.
pbmc$pct_reads_in_peaks <-
  pbmc$peak_region_fragments / pbmc$passed_filters * 100
pbmc$blacklist_ratio <-
  pbmc$blacklist_region_fragments / pbmc$peak_region_fragments

# TSS profile, split at a TSS enrichment of 2.
tss_is_high <- pbmc$TSS.enrichment > 2
pbmc$high.tss <- ifelse(tss_is_high, "High", "Low")
TSSPlot(pbmc, group.by = "high.tss") + NoLegend()

# Fragment length histogram, split at a nucleosome signal of 4.
ns_is_high <- pbmc$nucleosome_signal > 4
pbmc$nucleosome_group <- ifelse(ns_is_high, "NS > 4", "NS < 4")
FragmentHistogram(object = pbmc, group.by = "nucleosome_group")

# Violin plots of all five QC metrics in a single row.
qc_features <- c(
  "pct_reads_in_peaks",
  "peak_region_fragments",
  "TSS.enrichment",
  "blacklist_ratio",
  "nucleosome_signal"
)
VlnPlot(object = pbmc, features = qc_features, pt.size = 0.0, ncol = 5)

# Keep only the cells that satisfy every QC threshold, then print the
# filtered object.
pbmc <- subset(
  x = pbmc,
  subset = peak_region_fragments > 3000 & peak_region_fragments < 20000 &
    pct_reads_in_peaks > 15 & blacklist_ratio < 0.05 &
    nucleosome_signal < 4 & TSS.enrichment > 2
)
pbmc
```

### We next perform dimension reduction using scOpen. Note that we have to use the reticulate package to call Python functions from R.
```{r}
library(reticulate)

# Import the scOpen Python package through reticulate.
sc <- import("scopen")

# Raw count matrix of the "peaks" assay. Fetched once and reused both as the
# scOpen input and as the source of the cell names for the embedding.
peak_counts <- pbmc@assays$peaks@counts

# Transpose the scOpen output so that rows are cells, the orientation
# CreateDimReducObject() expects for `embeddings`.
# NOTE(review): this assumes scopen_dr() returns components x cells -- confirm.
matDR <- t(as.matrix(sc$Main$scopen_dr(peak_counts)))

# seq_len() instead of 1:ncol() so an empty result cannot yield c(1, 0).
colnames(matDR) <- paste0("scOpen_", seq_len(ncol(matDR)))
rownames(matDR) <- colnames(peak_counts)

# Store the reduction through the `[[<-` setter rather than writing into the
# @reductions slot directly, so the object's own checks are applied. The key
# is given explicitly instead of being inferred from the column names.
pbmc[["scOpen"]] <- CreateDimReducObject(
  embeddings = matDR,
  key = "scOpen_",
  assay = DefaultAssay(pbmc)
)

# Correlation of each scOpen dimension with sequencing depth.
DepthCor(pbmc, reduction = "scOpen", n = 20)
```

### Non-linear dimension reduction and clustering
```{r, fig.height=8, fig.width=8}
# UMAP, neighbour graph and clustering on the first 30 scOpen dimensions.
scopen_dims <- 1:30
pbmc <- RunUMAP(
  object = pbmc,
  reduction = "scOpen",
  dims = scopen_dims,
  reduction.name = "umap_scopen"
)
pbmc <- FindNeighbors(
  object = pbmc,
  reduction = "scOpen",
  dims = scopen_dims
)
pbmc <- FindClusters(object = pbmc, algorithm = 3, verbose = FALSE)

# Clusters shown on the scOpen-based UMAP.
DimPlot(object = pbmc, reduction = "umap_scopen", label = TRUE)
```

### Create a gene activity matrix
```{r, fig.height=8, fig.width=12}
# Gene activity matrix computed from the chromatin data.
gene_activity <- GeneActivity(pbmc)

# Store the gene activities as a new "RNA" assay and log-normalise it, using
# the median per-cell total of that assay as the scale factor.
pbmc[["RNA"]] <- CreateAssayObject(counts = gene_activity)
rna_scale_factor <- median(pbmc$nCount_RNA)
pbmc <- NormalizeData(
  object = pbmc,
  assay = "RNA",
  normalization.method = "LogNormalize",
  scale.factor = rna_scale_factor
)

DefaultAssay(pbmc) <- "RNA"

# Activity of six marker genes on the scOpen UMAP, three panels per row.
marker_genes <- c("MS4A1", "CD3D", "LEF1", "NKG7", "TREM1", "LYZ")
FeaturePlot(
  object = pbmc,
  features = marker_genes,
  reduction = "umap_scopen",
  pt.size = 0.1,
  max.cutoff = "q95",
  ncol = 3
)
```

### Download scRNA-seq data
```{bash, echo=FALSE}
# Fetch the pre-processed scRNA-seq object; the integration chunk below
# loads it from ./pbmc_10k_v3.rds.
wget --no-verbose https://www.dropbox.com/s/zn6khirjafoyyxl/pbmc_10k_v3.rds
```


### Integrating with scRNA-seq data
```{r, fig.width=12, fig.height=6}
# Pre-processed PBMC scRNA-seq object, used as the annotation reference.
pbmc_rna <- readRDS("./pbmc_10k_v3.rds")

# Anchors between the scRNA-seq reference and the scATAC-seq query (CCA).
anchors_scopen <- FindTransferAnchors(
  reference = pbmc_rna,
  query = pbmc,
  reduction = "cca"
)

# Transfer the reference cell types, weighting anchors in the scOpen space.
labels_scopen <- TransferData(
  anchorset = anchors_scopen,
  refdata = pbmc_rna$celltype,
  weight.reduction = pbmc[["scOpen"]],
  dims = 1:30
)

pbmc <- AddMetaData(object = pbmc, metadata = labels_scopen)

# Left panel: reference cell types.
# Right panel: predicted labels on the scOpen UMAP.
rna_panel <- DimPlot(
  object = pbmc_rna,
  group.by = "celltype",
  label = TRUE,
  repel = TRUE
) +
  NoLegend() +
  ggtitle("scRNA-seq")

atac_panel <- DimPlot(
  object = pbmc,
  group.by = "predicted.id",
  reduction = "umap_scopen",
  label = TRUE,
  repel = TRUE
) +
  NoLegend() +
  ggtitle("scATAC-seq")

rna_panel + atac_panel
```

### Evaluate using silhouette score
```{r}
library(cluster)

# Cell-by-cell correlation distance computed from the scOpen embedding.
scopen_embedding <- pbmc@reductions$scOpen@cell.embeddings
cell_dist <- 1 - cor(t(scopen_embedding))

# Predicted cell types encoded as numeric cluster codes.
label_codes <- as.numeric(factor(pbmc@meta.data$predicted.id))

# Mean silhouette width over all cells.
sil <- silhouette(x = label_codes, dmatrix = cell_dist)
print(mean(sil[, "sil_width"]))
```

## Dimension reduction using LSI
```{r, fig.height=6, fig.width=6}
# The LSI workflow runs on the peak assay.
DefaultAssay(pbmc) <- "peaks"

# TF-IDF, top-feature selection and SVD, in that order.
pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = "q0")
pbmc <- RunSVD(pbmc)

# Correlation of each LSI component with sequencing depth.
DepthCor(pbmc, reduction = "lsi", n = 20)

# Every downstream LSI step uses components 2 to 30.
lsi_dims <- 2:30
pbmc <- RunUMAP(
  object = pbmc,
  reduction = "lsi",
  dims = lsi_dims,
  reduction.name = "umap_lsi"
)
pbmc <- FindNeighbors(object = pbmc, reduction = "lsi", dims = lsi_dims)
pbmc <- FindClusters(object = pbmc, algorithm = 3, verbose = FALSE)
DimPlot(object = pbmc, reduction = "umap_lsi", label = TRUE) + NoLegend()

# Label transfer is run with the gene-activity assay as the default.
DefaultAssay(pbmc) <- "RNA"

anchors_lsi <- FindTransferAnchors(
  reference = pbmc_rna,
  query = pbmc,
  reduction = "cca"
)

# Same transfer as for scOpen, but anchors are weighted in the LSI space.
labels_lsi <- TransferData(
  anchorset = anchors_lsi,
  refdata = pbmc_rna$celltype,
  weight.reduction = pbmc[["lsi"]],
  dims = lsi_dims
)

pbmc <- AddMetaData(object = pbmc, metadata = labels_lsi)
```

### Visualize
```{r, fig.height=6, fig.width=12}
# Left panel: reference cell types on the scRNA-seq object.
rna_panel <- DimPlot(
  object = pbmc_rna,
  group.by = "celltype",
  label = TRUE,
  repel = TRUE
) +
  NoLegend() +
  ggtitle("scRNA-seq")

# Right panel: predicted labels on the LSI-based UMAP.
atac_panel <- DimPlot(
  object = pbmc,
  group.by = "predicted.id",
  reduction = "umap_lsi",
  label = TRUE,
  repel = TRUE
) +
  NoLegend() +
  ggtitle("scATAC-seq")

rna_panel + atac_panel

```


## Evaluate using silhouette score
```{r}
library(cluster)

# Evaluate LSI on the same components that the LSI UMAP, neighbour graph and
# label transfer use (dims = 2:30). The previous version built the distance
# matrix from every LSI component, including component 1, which all other
# LSI steps in this vignette leave out.
lsi_dims <- 2:30
lsi_embedding <- pbmc@reductions$lsi@cell.embeddings[, lsi_dims, drop = FALSE]

# Cell-by-cell correlation distance in the selected LSI space.
df.dist <- 1 - cor(t(lsi_embedding))

# Mean silhouette width of the predicted cell types.
si <- silhouette(x = as.numeric(factor(pbmc@meta.data$predicted.id)),
                 dmatrix = df.dist)

print(mean(si[, "sil_width"]))
```

## sessionInfo()
```{r}
# Print the R version and the loaded package versions used for this run.
sessionInfo()
```