Fixed outstanding issues. Double check and push new release in next c…

…ommit
noush-joglekar · Sep 22, 2021 · 9662cb3 · 9662cb3
1 parent 7afecd9
commit 9662cb3
Show file tree

Hide file tree

Showing 17 changed files with 547 additions and 703 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,3 +2,4 @@
 ^\.Rproj\.user$
 
 ^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 .Rhistory
 .RData
 .Ruserdata
+doc
+Meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,11 @@
 Package: scisorseqr
 Type: Package
 Title: Processes long reads for differential isoform analysis
-Version: 0.1.1
+Version: 0.1.2
 Author: Anoushka Joglekar
 Maintainer: Anoushka Joglekar <anj2026@med.cornell.edu>
 Description: For any comparative (e.g. case-control) studies of alternative splicing,
-    scisorseqr is the one stop shop for analyzing single-cell long read data. The linux 
+    scisorseqr is the one stop shop for analyzing single-cell long read data. The linux
     based package includes functions for barcode deconvolution from fastqs,
     integration with long read alignment tools like STARlong and minimap2,
     mapping and filtering of high confidence full-length spliced reads, and some
@@ -27,7 +27,7 @@ Suggests:
     utils,
     methods,
     spelling,
-    knitr, 
+    knitr,
     rmarkdown,
     ggplot2,
     cowplot

diff --git a/NAMESPACE b/NAMESPACE
@@ -17,5 +17,6 @@ export(triHeatmap)
 import(cowplot)
 import(dplyr)
 import(ggplot2)
+import(parallel)
 import(scales)
 importFrom(magrittr,"%>%")
diff --git a/R/ExonQuant.R b/R/ExonQuant.R
@@ -6,31 +6,32 @@
 #'
 #' For ONT data we recomment \url{https://github.com/ablab/IsoQuant.git}
 #' which allows for non-exact splice-site matching
-#' @param allInfoFile file containing cell barcode and celltype information
+#' @param allInfoFile file containing barcode, celltype, and exon information
 #' per read. Defaults to output of the InfoPerLongRead function
-#' @param exon.gff gzipped file containing exon mapping information.
-#' Defaults to output from MapAndFilter
 #' @param groupingFactor "Celltype" or "Barcode" to group reads by for
-#' counting inclusion levels. Defaults to celltype
+#' counting inclusion levels. Defaults to Celltype
 #' @param threshold minimum number of reads per grouping factor in order
 #' to consider that exon to be sufficiently expressed. Defaults to 10
 #' @param threads number of threads to parallelize the function. Defaults
 #' to 4
 #' @seealso \code{\link{MapAndFilter}}
 #' @seealso \code{\link{InfoPerLongRead}}
 #' @return ExonQuantOutput/InclusionExclusionCounts.tsv
+#' 
+#' @import dplyr
+#' @import parallel
 #' @export
-#'
 
-ExonQuant <- function(allInfoFile = 'LongReadInfo/AllInfo',
-                           exon.gff = 'LRProcessingOutput/mapping.bestperRead.RNAdirection.withConsensIntrons.gff.gz',
+ExonQuant <- function(allInfoFile = 'LongReadInfo/AllInfo_IncompleteReads',
                            groupingFactor = "Celltype",threshold = 10, numThreads = 4) {
 
   if(!dir.exists('ExonQuantOutput')){dir.create("ExonQuantOutput/")}
+  if(file.exists('ExonQuantOutput/InclusionExclusionCounts.tsv')){
+    file.remove('ExonQuantOutput/InclusionExclusionCounts.tsv')}
 
   R_file <- system.file("RScript", "ExonCounting.R", package = "scisorseqr")
 
-  countExons <- paste("Rscript", R_file, allInfoFile, exon.gff, threshold, groupingFactor, numThreads)
+  countExons <- paste("Rscript", R_file, allInfoFile, groupingFactor, numThreads, threshold)
   system(countExons)
 
 }
diff --git a/R/InfoPerLongRead.R b/R/InfoPerLongRead.R
@@ -1,9 +1,15 @@
-#' Combine all information per read to a flat file
+#' Combine all available information per read to a flat file
 #' @aliases LongReadInfo
 #' @description Function to concatenate all the information per read, i.e
 #' gene-name, cellular barcode, UMI, cell-type information, and isoform
-#' information into one file. Also outputs basic stats such as number of
-#' reads/ genes/ UMIs per cellular barcode
+#' information into one file. Isoform information includes a string separated list of 
+#' introns, Cage and PolyA peak information if available, and a 
+#' string separated list of exons. 
+#' AllInfo contains the list of complete and full-length mapped, spliced, and barcoded reads.
+#' AllInfo_Incomplete is a superset of the above, and also contains reads that are not
+#' classified as complete based on Cage/PolyA data. 
+#' The function also outputs basic stats 
+#' such as number of reads/ genes/ UMIs per cellular barcode
 #' @seealso \code{\link{MapAndFilter}}
 #' @seealso \code{\link{GetBarcodes}}
 #' @param barcodeOutputFile .csv file containing barcode and cell-type information
@@ -13,8 +19,10 @@
 #' to that output, else it uses the canonically spliced full-length reads
 #' @param minTimesIsoObserve minimum number of times an isoform is observed in the
 #' dataset. Defaults to 5
+#' @param rmTmpFolder Logical indicating whether you want to delete contents of the 
+#' pre-processing folder. Defaults to TRUE
 #' @export
-InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObserve = 5) {
+InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObserve = 5, rmTmpFolder = TRUE) {
   longReadInfo_sh <- system.file("bash", "longReadInfo.sh", package = "scisorseqr")
 
   longReadInfoFolder <- "LongReadInfo/"
@@ -35,4 +43,8 @@ InfoPerLongRead <- function(barcodeOutputFile, mapAndFilterOut, minTimesIsoObser
           exonFile, minTimesIsoObserve, incompFile)
   system(longReadInfoComm)
 
+  removeDirComm <- paste("rm -r",mapAndFilterOut)
+
+  if(rmTmpFolder == TRUE){system(removeDirComm)}
+
 }
diff --git a/R/sigSplitPie.R b/R/sigSplitPie.R
@@ -6,6 +6,7 @@
 #' @return Directory labelled 'Visualizations' containing a pie chart containing
 #' the breakup of number of isoforms needed to cross deltaPI threshold
 #' of pairwise comparisons containing cell groups provided in input
+#' Legend indicates the number of isoforms needed to cross the deltaPI = 0.1 threshold
 #' @usage sigSplitPie('compDir/')
 #'
 #' @import dplyr

diff --git a/inst/RScript/ExonCounting.R b/inst/RScript/ExonCounting.R
@@ -1,68 +1,63 @@
-#' @import dplyr
 #' @importFrom magrittr %>%
+#' @import parallel
+#' @import dplyr
+#' @export
 
 `%>%` <- magrittr::`%>%`
 
 args <- commandArgs(trailingOnly=TRUE)
-## arguments : AllInfo file, exonInfo.gff, threshold, grouping factor
-
+## arguments : AllInfo file, groupingFactor, nThreads, threshold
 
 allInfo <- data.table::fread(args[1],sep="\t")
-allInfo <- allInfo[,1:6] ## Read in file with read - gene - celltype assignments
-colnames(allInfo) <- c("Read","Gene","Celltype","Barcode","UMI","Stretch")
-groupingFactor <- args[4]
-
-geneInfo <- allInfo %>% dplyr::select(Read,Gene,groupingFactor)
-
-print("Reading in GFF file, will take a while")
-
-exonGFF <- read.table(gzfile(args[2]))[,-c(2,3,6,8,9,11)]	## GFF file with exon info
-colnames(exonGFF) <- c("chr","start","end","strand","readname")
-exonGFF <- exonGFF %>% tidyr::separate(readname,into=c("Read","path"),sep=".path") %>%
-  dplyr::select(-path) %>% tidyr::unite(exon, c(chr,start,end,strand),sep="_",remove=FALSE)
-exonGFF <- dplyr::right_join(exonGFF,geneInfo) ## Merge gene information with GFF in case of multi-mapping exons
-
-threshold <- as.integer(args[3])
-numThreads <- as.integer(args[5])
-
-
-uniqExons <- exonGFF %>% dplyr::select(-c(Read,groupingFactor)) %>% dplyr::distinct()		## Get unique exons to iterate over
-edges <- exonGFF %>% dplyr::group_by(Read) %>% dplyr::mutate(s=min(start),e=max(end)) %>%
-  dplyr::select(Read,Gene,s,e,exon,groupingFactor) %>% dplyr::distinct() %>% dplyr::as_data_frame()
-print(paste0("About to start processing exons per ",groupingFactor))
-
-calcPSI <- function(x){		## x for line in uniqExons
-	geneDF <- edges %>% dplyr::filter(Gene == uniqExons$Gene[x]) %>% dplyr::group_by(.dots = groupingFactor) %>%
-	  dplyr::filter(s <= uniqExons$start[x] & e >= uniqExons$end[x]) %>% as.data.frame()
-	numReads <- geneDF %>% dplyr::group_by(.dots = groupingFactor, Read) %>% dplyr::select(Read,groupingFactor) %>%
-	  dplyr::ungroup() %>% dplyr::group_by(.dots = groupingFactor) %>% dplyr::distinct() %>%
-	  dplyr::add_count() %>% dplyr::filter(n>=threshold) %>% as.data.frame()
-	geneDF <- geneDF %>% dplyr::filter(Read %in% numReads$Read)  %>% as.data.frame()
-	u_gf <- geneDF %>% dplyr::select(groupingFactor) %>% dplyr::distinct()	## make list of unique grouping factors
-	psiGF <- NULL
-	for(gf in u_gf[,1]){	## If total reads spanning exon more than sampling rate, extract those reads
-		reads <- geneDF %>% dplyr::filter(get(groupingFactor) == gf)
-		psi = NULL
-		sr <- reads %>% dplyr::select(Read) %>% dplyr::distinct()
-		inc <- reads %>% dplyr::filter(Read %in% sr$Read) %>%
-		  dplyr::filter(exon == uniqExons$exon[x]) %>% nrow()
-		psi <- inc/nrow(sr)	## Total reads spanning the exon = sr
-		psi <- as.data.frame(psi)
-		psi$inclusion <- inc
-		psi$exclusion <- nrow(sr) - inc
-		psi$exon <- uniqExons$exon[x]
-		psi$Gene <- uniqExons$Gene[x]
-		psi[,as.character(groupingFactor)] <- gf
-		psiGF <- rbind(psiGF,psi) }
-	return(psiGF)
+if(ncol(allInfo) == 11){
+        allInfo <- allInfo[,c(1:4,9)]
+} else { allInfo <- allInfo[,c(1:4,7)]}  ## Read in file with read - gene - celltype assignments
+
+colnames(allInfo) <- c("Read","Gene","Celltype","Barcode","Exons")
+groupingFactor <- args[2]
+
+nThreads <- as.integer(args[3])
+rpg <- as.integer(args[4])
+
+outDir <- 'ExonQuantOutput/'
+
+allInfo_SE <- allInfo %>% dplyr::group_by(Gene) %>% dplyr::add_count() %>%
+        dplyr::filter(n >= rpg) %>% dplyr::select(-n) %>% dplyr::rowwise() %>%
+        dplyr::mutate(start = unlist(strsplit(Exons,"_"))[2],
+                end = rev(unlist(strsplit(Exons,"_")))[2]) %>%
+        as.data.frame()
+
+internalExons = allInfo_SE %>% tidyr::separate_rows(Exons,sep = ";%;") %>% dplyr::filter(Exons != "") %>%
+        dplyr::group_by(Read) %>% dplyr::add_count() %>% dplyr::filter(n>=3) %>% dplyr::slice(2:(dplyr::n()-1))
+
+uniqExons <- internalExons %>% dplyr::ungroup() %>% dplyr::select(Exons, Gene) %>% dplyr::distinct()
+
+inclusionCounts <- internalExons %>% dplyr::ungroup() %>% dplyr::select(Exons,Gene,all_of(groupingFactor)) %>%
+        dplyr::group_by(Exons,Gene,.dots=groupingFactor) %>% dplyr::add_count(name = "Inclusion") %>%
+        dplyr::distinct() %>% as.data.frame()
+
+readSE <- internalExons %>% dplyr::select(Read,Gene,all_of(groupingFactor),start,end) %>% dplyr::distinct() %>% as.data.frame()
+
+checkSpanningReads <- function(gene){
+        exons <- uniqExons %>% dplyr::filter(Gene == gene) %>%
+                tidyr::separate(Exons,into=c("chr","s","e","strand"),sep = "_",remove=FALSE)
+        reads <- readSE %>% dplyr::filter(Gene == gene)
+        spanningReads <- dplyr::left_join(reads,exons,by = "Gene") %>% dplyr::filter(s >= start && e <= end) %>%
+                dplyr::select(Exons,Gene,all_of(groupingFactor)) %>% dplyr::group_by(Exons,Gene,.dots = groupingFactor) %>%
+                dplyr::add_count(name = "Total") %>% dplyr::distinct() %>% as.data.frame()
+        inclusionReads <- inclusionCounts %>% dplyr::filter(Gene == gene)
+        inc_tot <- dplyr::right_join(inclusionReads,spanningReads,by = c('Exons',"Gene",groupingFactor)) %>%
+                replace(is.na(.),0) %>%
+                dplyr::mutate(PSI = Inclusion/Total, Exclusion = Total-Inclusion) %>% as.data.frame()
+        fName <- file.path(outDir,"InclusionExclusionCounts.tsv")
+        if(file.exists(fName)){
+                write.table(inc_tot,file.path(outDir,"InclusionExclusionCounts.tsv"),sep ="\t",
+                        append= T, quote = F, row.names = F, col.names = FALSE)
+        } else{
+               	write.table(inc_tot,file.path(outDir,"InclusionExclusionCounts.tsv"),sep ="\t",
+                        quote = F, row.names = F, col.names = TRUE)
+        }
+	return
 }
 
-
-sampledPSI <- parallel::mclapply(1:nrow(uniqExons), function(x) calcPSI(x), mc.cores=numThreads)
-#sampledPSI <- parallel::mclapply(1:200, function(x) calcPSI(x), mc.cores=28) ## testing purposes
-
-print("Converting to dataframe")
-psiDF <- dplyr::bind_rows(sampledPSI) %>% dplyr::select(exon,Gene,groupingFactor,inclusion,exclusion,psi)
-
-write.table(psiDF, file="ExonQuantOutput/InclusionExclusionCounts.tsv",
-        sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
+byGene = parallel::mclapply(unique(uniqExons$Gene), function(g) checkSpanningReads(g), mc.cores = nThreads)
diff --git a/inst/RScript/PieChart_Viz.R b/inst/RScript/PieChart_Viz.R
@@ -3,6 +3,11 @@
 #' @import scales
 #' @importFrom magrittr %>%
 
+#suppressPackageStartupMessages({
+#  library(dplyr)
+#  library(ggplot2)
+#})
+
 `%>%` <- magrittr::`%>%`
 
 args <- commandArgs(trailingOnly=TRUE)
@@ -51,19 +56,18 @@ nonSig <- length(which(resultsFile$FDR>0.05))
 df <- as.data.frame(table(breakup))
 df$breakup <- as.character(df$breakup)
 df <- rbind(c("NonSig",nonSig),df)
-df$value <- as.numeric(df$Freq)/total
+df$value <- as.numeric(df$Freq)*100/total
 
 
 df <- df %>% dplyr::mutate(ypos = cumsum(value)-0.5*value)
 cols <- c('NonSig'="#F9C6BA",'1'="#930077",'2'="#DD6892",'3'="#3c6f9c",'4'="#29cdb5",'5'="#048998",'6'="#553c8b")
 
-final_pie <- ggplot2::ggplot(df, aes(x="", y=value, fill=breakup)) +
-  geom_bar(stat="identity", width=1) +
-  coord_polar("y", start=0) +
-  theme_void() +
-  theme(legend.position="none") +
-  geom_text(aes(y = ypos, label = percent(value)), color = "white", size=6) +
-  scale_fill_manual("",values=cols)
+final_pie <- ggplot2::ggplot(df, ggplot2::aes(x="", y=value, fill=breakup)) +
+  ggplot2::geom_bar(stat="identity", width=1) +
+  ggplot2::coord_polar("y", start=0) +
+  ggplot2::theme_void() +
+  ggplot2::geom_text(ggplot2::aes(y = ypos, label = format(value,digits=2)), color = "white", size=6) +
+  ggplot2::scale_fill_manual("",values=cols)
 
 
 pdf(file=paste0("Visualizations/PieCharts_BreakUp_",name,".pdf"),8,6)

diff --git a/inst/python/Get_IsoformXClusterMat.py b/inst/python/Get_IsoformXClusterMat.py
@@ -7,10 +7,12 @@
 start_time = time.time()
 
 print("Creating Isoform X cell matrix")
-print("Processing file with ",len(all_info)," entries")
 
 input_file = sys.argv[1]
 all_info = [x.strip('\n').split('\t') for x in open(input_file).readlines()][1:]
+
+print("Processing file with ",len(all_info)," entries")
+
 gene_iso_names = [x[0] for x in all_info]
 cluster_names = [x[1] for x in all_info]
 num_iso = [int(x[2]) for x in all_info]

diff --git a/man/ExonQuant.Rd b/man/ExonQuant.Rd
diff --git a/man/InfoPerLongRead.Rd b/man/InfoPerLongRead.Rd
diff --git a/man/sigSplitPie.Rd b/man/sigSplitPie.Rd