Skip to content

Commit

Permalink
Fixed outstanding issues. Double check and push new release in next c…
Browse files Browse the repository at this point in the history
…ommit
  • Loading branch information
Anoushka committed Sep 22, 2021
1 parent 7afecd9 commit 9662cb3
Show file tree
Hide file tree
Showing 17 changed files with 547 additions and 703 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
^\.Rproj\.user$

^doc$
^Meta$
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
.Rhistory
.RData
.Ruserdata
doc
Meta
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Package: scisorseqr
Type: Package
Title: Processes long reads for differential isoform analysis
Version: 0.1.1
Version: 0.1.2
Author: Anoushka Joglekar
Maintainer: Anoushka Joglekar <anj2026@med.cornell.edu>
Description: For any comparative (e.g. case-control) studies of alternative splicing,
scisorseqr is the one stop shop for analyzing single-cell long read data. The linux
scisorseqr is the one stop shop for analyzing single-cell long read data. The linux
based package includes functions for barcode deconvolution from fastqs,
integration with long read alignment tools like STARlong and minimap2,
mapping and filtering of high confidence full-length spliced reads, and some
Expand All @@ -27,7 +27,7 @@ Suggests:
utils,
methods,
spelling,
knitr,
knitr,
rmarkdown,
ggplot2,
cowplot
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ export(triHeatmap)
import(cowplot)
import(dplyr)
import(ggplot2)
import(parallel)
import(scales)
importFrom(magrittr,"%>%")
17 changes: 9 additions & 8 deletions R/ExonQuant.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,32 @@
#'
#' For ONT data we recomment \url{https://github.com/ablab/IsoQuant.git}
#' which allows for non-exact splice-site matching
#' @param allInfoFile file containing cell barcode and celltype information
#' @param allInfoFile file containing barcode, celltype, and exon information
#' per read. Defaults to output of the InfoPerLongRead function
#' @param exon.gff gzipped file containing exon mapping information.
#' Defaults to output from MapAndFilter
#' @param groupingFactor "Celltype" or "Barcode" to group reads by for
#' counting inclusion levels. Defaults to celltype
#' counting inclusion levels. Defaults to Celltype
#' @param threshold minimum number of reads per grouping factor in order
#' to consider that exon to be sufficiently expressed. Defaults to 10
#' @param threads number of threads to parallelize the function. Defaults
#' to 4
#' @seealso \code{\link{MapAndFilter}}
#' @seealso \code{\link{InfoPerLongRead}}
#' @return ExonQuantOutput/InclusionExclusionCounts.tsv
#'
#' @import dplyr
#' @import parallel
#' @export
#'

ExonQuant <- function(allInfoFile = 'LongReadInfo/AllInfo',
exon.gff = 'LRProcessingOutput/mapping.bestperRead.RNAdirection.withConsensIntrons.gff.gz',
ExonQuant <- function(allInfoFile = 'LongReadInfo/AllInfo_IncompleteReads',
groupingFactor = "Celltype",threshold = 10, numThreads = 4) {

if(!dir.exists('ExonQuantOutput')){dir.create("ExonQuantOutput/")}
if(file.exists('ExonQuantOutput/InclusionExclusionCounts.tsv')){
file.remove('ExonQuantOutput/InclusionExclusionCounts.tsv')}

R_file <- system.file("RScript", "ExonCounting.R", package = "scisorseqr")

countExons <- paste("Rscript", R_file, allInfoFile, exon.gff, threshold, groupingFactor, numThreads)
countExons <- paste("Rscript", R_file, allInfoFile, groupingFactor, numThreads, threshold)
system(countExons)

}
20 changes: 16 additions & 4 deletions R/InfoPerLongRead.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
#' Combine all information per read to a flat file
#' Combine all available information per read to a flat file
#' @aliases LongReadInfo
#' @description Function to concatenate all the information per read, i.e
#' gene-name, cellular barcode, UMI, cell-type information, and isoform
#' information into one file. Also outputs basic stats such as number of
#' reads/ genes/ UMIs per cellular barcode
#' information into one file. Isoform information includes a string separated list of
#' introns, Cage and PolyA peak information if available, and a
#' string separated list of exons.
#' AllInfo contains the list of complete and full-length mapped, spliced, and barcoded reads.
#' AllInfo_Incomplete is a superset of the above, and also contains reads that are not
#' classified as complete based on Cage/PolyA data.
#' The function also outputs basic stats
#' such as number of reads/ genes/ UMIs per cellular barcode
#' @seealso \code{\link{MapAndFilter}}
#' @seealso \code{\link{GetBarcodes}}
#' @param barcodeOutputFile .csv file containing barcode and cell-type information
Expand All @@ -13,8 +19,10 @@
#' to that output, else it uses the canonically spliced full-length reads
#' @param minTimesIsoObserve minimum number of times an isoform is observed in the
#' dataset. Defaults to 5
#' @param rmTmpFolder Logical indicating whether you want to delete contents of the
#' pre-processing folder. Defaults to TRUE
#' @export
InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObserve = 5) {
InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObserve = 5, rmTmpFolder = TRUE) {
longReadInfo_sh <- system.file("bash", "longReadInfo.sh", package = "scisorseqr")

longReadInfoFolder <- "LongReadInfo/"
Expand All @@ -35,4 +43,8 @@ InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObser
exonFile, minTimesIsoObserve, incompFile)
system(longReadInfoComm)

removeDirComm <- paste("rm -r",mapAndFilterOut)

if(rmTmpFolder == TRUE){system(removeDirComm)}

}
1 change: 1 addition & 0 deletions R/sigSplitPie.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#' @return Directory labelled 'Visualizations' containing a pie chart containing
#' the breakup of number of isoforms needed to cross deltaPI threshold
#' of pairwise comparisons containing cell groups provided in input
#' Legend indicates the number of isoforms needed to cross the deltaPI = 0.1 threshold
#' @usage sigSplitPie('compDir/')
#'
#' @import dplyr
Expand Down
113 changes: 54 additions & 59 deletions inst/RScript/ExonCounting.R
Original file line number Diff line number Diff line change
@@ -1,68 +1,63 @@
#' @import dplyr
#' @importFrom magrittr %>%
#' @import parallel
#' @import dplyr
#' @export

`%>%` <- magrittr::`%>%`

args <- commandArgs(trailingOnly=TRUE)
## arguments : AllInfo file, exonInfo.gff, threshold, grouping factor

## arguments : AllInfo file, groupingFactor, nThreads, threshold

allInfo <- data.table::fread(args[1],sep="\t")
allInfo <- allInfo[,1:6] ## Read in file with read - gene - celltype assignments
colnames(allInfo) <- c("Read","Gene","Celltype","Barcode","UMI","Stretch")
groupingFactor <- args[4]

geneInfo <- allInfo %>% dplyr::select(Read,Gene,groupingFactor)

print("Reading in GFF file, will take a while")

exonGFF <- read.table(gzfile(args[2]))[,-c(2,3,6,8,9,11)] ## GFF file with exon info
colnames(exonGFF) <- c("chr","start","end","strand","readname")
exonGFF <- exonGFF %>% tidyr::separate(readname,into=c("Read","path"),sep=".path") %>%
dplyr::select(-path) %>% tidyr::unite(exon, c(chr,start,end,strand),sep="_",remove=FALSE)
exonGFF <- dplyr::right_join(exonGFF,geneInfo) ## Merge gene information with GFF in case of multi-mapping exons

threshold <- as.integer(args[3])
numThreads <- as.integer(args[5])


uniqExons <- exonGFF %>% dplyr::select(-c(Read,groupingFactor)) %>% dplyr::distinct() ## Get unique exons to iterate over
edges <- exonGFF %>% dplyr::group_by(Read) %>% dplyr::mutate(s=min(start),e=max(end)) %>%
dplyr::select(Read,Gene,s,e,exon,groupingFactor) %>% dplyr::distinct() %>% dplyr::as_data_frame()
print(paste0("About to start processing exons per ",groupingFactor))

calcPSI <- function(x){ ## x for line in uniqExons
geneDF <- edges %>% dplyr::filter(Gene == uniqExons$Gene[x]) %>% dplyr::group_by(.dots = groupingFactor) %>%
dplyr::filter(s <= uniqExons$start[x] & e >= uniqExons$end[x]) %>% as.data.frame()
numReads <- geneDF %>% dplyr::group_by(.dots = groupingFactor, Read) %>% dplyr::select(Read,groupingFactor) %>%
dplyr::ungroup() %>% dplyr::group_by(.dots = groupingFactor) %>% dplyr::distinct() %>%
dplyr::add_count() %>% dplyr::filter(n>=threshold) %>% as.data.frame()
geneDF <- geneDF %>% dplyr::filter(Read %in% numReads$Read) %>% as.data.frame()
u_gf <- geneDF %>% dplyr::select(groupingFactor) %>% dplyr::distinct() ## make list of unique grouping factors
psiGF <- NULL
for(gf in u_gf[,1]){ ## If total reads spanning exon more than sampling rate, extract those reads
reads <- geneDF %>% dplyr::filter(get(groupingFactor) == gf)
psi = NULL
sr <- reads %>% dplyr::select(Read) %>% dplyr::distinct()
inc <- reads %>% dplyr::filter(Read %in% sr$Read) %>%
dplyr::filter(exon == uniqExons$exon[x]) %>% nrow()
psi <- inc/nrow(sr) ## Total reads spanning the exon = sr
psi <- as.data.frame(psi)
psi$inclusion <- inc
psi$exclusion <- nrow(sr) - inc
psi$exon <- uniqExons$exon[x]
psi$Gene <- uniqExons$Gene[x]
psi[,as.character(groupingFactor)] <- gf
psiGF <- rbind(psiGF,psi) }
return(psiGF)
if(ncol(allInfo) == 11){
allInfo <- allInfo[,c(1:4,9)]
} else { allInfo <- allInfo[,c(1:4,7)]} ## Read in file with read - gene - celltype assignments

colnames(allInfo) <- c("Read","Gene","Celltype","Barcode","Exons")
groupingFactor <- args[2]

nThreads <- as.integer(args[3])
rpg <- as.integer(args[4])

outDir <- 'ExonQuantOutput/'

allInfo_SE <- allInfo %>% dplyr::group_by(Gene) %>% dplyr::add_count() %>%
dplyr::filter(n >= rpg) %>% dplyr::select(-n) %>% dplyr::rowwise() %>%
dplyr::mutate(start = unlist(strsplit(Exons,"_"))[2],
end = rev(unlist(strsplit(Exons,"_")))[2]) %>%
as.data.frame()

internalExons = allInfo_SE %>% tidyr::separate_rows(Exons,sep = ";%;") %>% dplyr::filter(Exons != "") %>%
dplyr::group_by(Read) %>% dplyr::add_count() %>% dplyr::filter(n>=3) %>% dplyr::slice(2:(dplyr::n()-1))

uniqExons <- internalExons %>% dplyr::ungroup() %>% dplyr::select(Exons, Gene) %>% dplyr::distinct()

inclusionCounts <- internalExons %>% dplyr::ungroup() %>% dplyr::select(Exons,Gene,all_of(groupingFactor)) %>%
dplyr::group_by(Exons,Gene,.dots=groupingFactor) %>% dplyr::add_count(name = "Inclusion") %>%
dplyr::distinct() %>% as.data.frame()

readSE <- internalExons %>% dplyr::select(Read,Gene,all_of(groupingFactor),start,end) %>% dplyr::distinct() %>% as.data.frame()

checkSpanningReads <- function(gene){
exons <- uniqExons %>% dplyr::filter(Gene == gene) %>%
tidyr::separate(Exons,into=c("chr","s","e","strand"),sep = "_",remove=FALSE)
reads <- readSE %>% dplyr::filter(Gene == gene)
spanningReads <- dplyr::left_join(reads,exons,by = "Gene") %>% dplyr::filter(s >= start && e <= end) %>%
dplyr::select(Exons,Gene,all_of(groupingFactor)) %>% dplyr::group_by(Exons,Gene,.dots = groupingFactor) %>%
dplyr::add_count(name = "Total") %>% dplyr::distinct() %>% as.data.frame()
inclusionReads <- inclusionCounts %>% dplyr::filter(Gene == gene)
inc_tot <- dplyr::right_join(inclusionReads,spanningReads,by = c('Exons',"Gene",groupingFactor)) %>%
replace(is.na(.),0) %>%
dplyr::mutate(PSI = Inclusion/Total, Exclusion = Total-Inclusion) %>% as.data.frame()
fName <- file.path(outDir,"InclusionExclusionCounts.tsv")
if(file.exists(fName)){
write.table(inc_tot,file.path(outDir,"InclusionExclusionCounts.tsv"),sep ="\t",
append= T, quote = F, row.names = F, col.names = FALSE)
} else{
write.table(inc_tot,file.path(outDir,"InclusionExclusionCounts.tsv"),sep ="\t",
quote = F, row.names = F, col.names = TRUE)
}
return
}


sampledPSI <- parallel::mclapply(1:nrow(uniqExons), function(x) calcPSI(x), mc.cores=numThreads)
#sampledPSI <- parallel::mclapply(1:200, function(x) calcPSI(x), mc.cores=28) ## testing purposes

print("Converting to dataframe")
psiDF <- dplyr::bind_rows(sampledPSI) %>% dplyr::select(exon,Gene,groupingFactor,inclusion,exclusion,psi)

write.table(psiDF, file="ExonQuantOutput/InclusionExclusionCounts.tsv",
sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
byGene = parallel::mclapply(unique(uniqExons$Gene), function(g) checkSpanningReads(g), mc.cores = nThreads)
20 changes: 12 additions & 8 deletions inst/RScript/PieChart_Viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
#' @import scales
#' @importFrom magrittr %>%

#suppressPackageStartupMessages({
# library(dplyr)
# library(ggplot2)
#})

`%>%` <- magrittr::`%>%`

args <- commandArgs(trailingOnly=TRUE)
Expand Down Expand Up @@ -51,19 +56,18 @@ nonSig <- length(which(resultsFile$FDR>0.05))
df <- as.data.frame(table(breakup))
df$breakup <- as.character(df$breakup)
df <- rbind(c("NonSig",nonSig),df)
df$value <- as.numeric(df$Freq)/total
df$value <- as.numeric(df$Freq)*100/total


df <- df %>% dplyr::mutate(ypos = cumsum(value)-0.5*value)
cols <- c('NonSig'="#F9C6BA",'1'="#930077",'2'="#DD6892",'3'="#3c6f9c",'4'="#29cdb5",'5'="#048998",'6'="#553c8b")

final_pie <- ggplot2::ggplot(df, aes(x="", y=value, fill=breakup)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
theme_void() +
theme(legend.position="none") +
geom_text(aes(y = ypos, label = percent(value)), color = "white", size=6) +
scale_fill_manual("",values=cols)
final_pie <- ggplot2::ggplot(df, ggplot2::aes(x="", y=value, fill=breakup)) +
ggplot2::geom_bar(stat="identity", width=1) +
ggplot2::coord_polar("y", start=0) +
ggplot2::theme_void() +
ggplot2::geom_text(ggplot2::aes(y = ypos, label = format(value,digits=2)), color = "white", size=6) +
ggplot2::scale_fill_manual("",values=cols)


pdf(file=paste0("Visualizations/PieCharts_BreakUp_",name,".pdf"),8,6)
Expand Down
4 changes: 3 additions & 1 deletion inst/python/Get_IsoformXClusterMat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
start_time = time.time()

print("Creating Isoform X cell matrix")
print("Processing file with ",len(all_info)," entries")

input_file = sys.argv[1]
all_info = [x.strip('\n').split('\t') for x in open(input_file).readlines()][1:]

print("Processing file with ",len(all_info)," entries")

gene_iso_names = [x[0] for x in all_info]
cluster_names = [x[1] for x in all_info]
num_iso = [int(x[2]) for x in all_info]
Expand Down
11 changes: 3 additions & 8 deletions man/ExonQuant.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 18 additions & 4 deletions man/InfoPerLongRead.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/sigSplitPie.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 9662cb3

Please sign in to comment.