Merge pull request #154 from cgat-developers/JS-current

Improvements to splicing, bamstats and rnaseq
cgat-developers · Nov 24, 2021 · 87fcdbc · 87fcdbc
2 parents deed012 + e9689ee
commit 87fcdbc
Show file tree

Hide file tree

Showing 9 changed files with 224 additions and 44 deletions.
diff --git a/cgatpipelines/Rtools/diffexonexpression.R b/cgatpipelines/Rtools/diffexonexpression.R
@@ -14,7 +14,7 @@
 #'
 #' -> todo: ideas
 
-## conda dependencies: r-cairo
+
 
 suppressMessages(library(futile.logger))
 suppressMessages(library(getopt))
@@ -84,7 +84,8 @@ run <- function(opt) {
   flog.info("Running DEXSeq")
   dxd = estimateSizeFactors(experiment)
   dxd = estimateDispersions(dxd)
-  dxd = testForDEU(dxd)
+  dxd = testForDEU(dxd, reducedModel = formula(opt$reducedmodel),
+                   fullModel =  formula(opt$model))
   dxd = estimateExonFoldChanges( dxd, fitExpToVar=opt$contrast)
   res = DEXSeqResults(dxd)
 
@@ -141,6 +142,53 @@ run <- function(opt) {
     plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast)
     end_plot()
   }
+
+  if(opt$perm >0){
+    flog.info("... performing permutations")
+    designperm <- colData(dxd)
+    dxdperm = dxd
+    x = 0
+    i = 1
+    y= list()
+    while(i <= opt$perm) {
+      flog.info(paste0("...... Permutation ", i))
+      designperm[, opt$contrast] = sample(designperm[, opt$contrast])
+      if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dxd)[,opt$contrast])[1]){
+        flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
+        next
+      }
+      colData(dxdperm) <- designperm
+      dxdperm = testForDEU(dxdperm)
+      dxdperm = estimateExonFoldChanges( dxdperm, fitExpToVar=opt$contrast)
+      resperm = DEXSeqResults(dxdperm)
+      x[i] = length(subset(resperm, padj < opt$alpha)$padj)
+      y[[i]] = dxdperm$group
+      i = i+1
+    }
+    flog.info(paste0("Result:   "  , x))
+    start_plot("Simulations", outdir=opt$outdir)
+    theme_set(theme_gray(base_size = 18))
+    sims = qplot(x,
+                 geom="histogram",
+                 breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"),
+                 main = "Histogram of DE experiments\n with random group labels", 
+                 xlab = "Number of differentially expressed exons",
+                 ylab = "Number of simulations") +
+      geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
+      theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
+    print(sims)
+    end_plot()
+    flog.info(paste0("... Permutation p value: ",
+                     length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
+    z <- list()
+    for(i in 0:length(x)){
+      z[i] <- paste( unlist(y[i]), collapse=' ')
+    }
+    z <- unlist(z)
+    df <- data.frame(number=x, combination=z)
+    df
+    write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv"))
+  }
 }
 
 main <- function() {
@@ -157,7 +205,14 @@ main <- function() {
       "--model",
       dest = "model",
       type = "character",
-      default = "~ group",
+      default = "~ sample + exon + group:exon",
+      help = paste("model formula for GLM")
+    ),
+    make_option(
+      "--reducedmodel",
+      dest = "reducedmodel",
+      type = "character",
+      default = "~ sample + exon",
       help = paste("model formula for GLM")
     ),
     make_option(
@@ -191,8 +246,8 @@ main <- function() {
     make_option(
       "--permute",
       dest = "perm",
-      type = "logical",
-      default = FALSE,
+      type = "integer",
+      default = 0,
       help = paste("number of permutations")
     ),
     make_option(

diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R
@@ -1,10 +1,10 @@
-#' filtering single cell data based on QC metrics
+#' Differential expression analysis
 #'
 #' WARNING: This script is work-in-progress
 #' 
 #' Example usage:
 #' 
-#' cgat-singlecell sc_diffexpression --rds-filename=sce.rds --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
+#' Rscript PATH/TO/diffexpression.R --rds-filename=sce.rds --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
 #'
 #' `sce.rds` is a single cell experiment object after filtering
 #'
@@ -261,12 +261,19 @@ run <- function(opt) {
     i = 1
     y= list()
     while(i <= opt$perm) {
-      designperm[, opt$contrast] = as.factor(sample(levels(designperm[, opt$contrast]), length(designperm[, opt$contrast]), replace=TRUE))
-      if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]) next
+      flog.info(paste0("...... Permutation ", i))
+      # Code to only shuffle the group labels for groups in coef
+      tempdesign <- designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),]
+      designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),][,opt$contrast] = sample(tempdesign[,opt$contrast])
+      if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]){
+        flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
+        flog.info(paste0("......... Number of reference replicates is: ", sum(designperm$group == levels(designperm[, opt$contrast])[1]), ", but should be: ", table(colData(dds)[,opt$contrast])[1]))
+        next
+      }
       colData(ddsperm) <- designperm
       ddsperm <- DESeq(ddsperm)
       resperm <- results(ddsperm)
-      x[i] = length(subset(resperm, padj < 0.05)$padj)
+      x[i] = length(subset(resperm, padj < opt$alpha)$padj)
       y[[i]] = ddsperm$group
       i = i+1
     }
@@ -278,12 +285,12 @@ run <- function(opt) {
                  main = "Histogram of DE experiments\n with random group labels", 
                  xlab = "Number of differentially expressed genes",
                  ylab = "Number of simulations") +
-      geom_vline(xintercept = length(rownames(subset(res, padj < 0.05)))) +
+      geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
       theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
     print(sims)
     end_plot()
     flog.info(paste0("... Permutation p value: ",
-                     length(x[x < length(rownames(subset(res, padj < 0.05)))])/length(x)))
+                     length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
     z <- list()
     for(i in 0:length(x)){
       z[i] <- paste( unlist(y[i]), collapse=' ')

diff --git a/cgatpipelines/Rtools/filtercounts.R b/cgatpipelines/Rtools/filtercounts.R
@@ -3,7 +3,7 @@
 #' 
 #' Example usage:
 #' 
-#' cgatflow R readandfiltercounts 
+#' Rscript PATH/TO/filtercounts.R 
 #' 
 #' input: directory containing read count files or tsv file containing reads
 #' additional input variables: method used to generate file, model

diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py
@@ -1293,7 +1293,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
     job_memory = picardmem
     picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
     job_threads = 3
-    infile, genome = infiles
+    infile, genome, rRNA_intervals = infiles
 
     if BamTools.getNumReads(infile) == 0:
         E.warn("no reads in %s - no metrics" % infile)
@@ -1307,5 +1307,6 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
     OUTPUT=%(outfile)s
     STRAND=%(strand)s
     VALIDATION_STRINGENCY=SILENT
+    RIBOSOMAL_INTERVALS=%(rRNA_intervals)s
     '''
     P.run(statement)
diff --git a/cgatpipelines/tasks/splicing.py b/cgatpipelines/tasks/splicing.py
@@ -54,6 +54,8 @@ class variables. Instead use an option string that can be set in
 
 import os
 import random
+import shutil
+import pandas as pd
 import cgat.BamTools.bamtools as BamTools
 import cgatcore.experiment as E
 import cgatpipelines.tasks.expression as Expression
@@ -98,6 +100,7 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
     with open(outdir + "/b2.txt", "w") as f:
         f.write(group2)
     readlength = BamTools.estimateTagSize(design.samples[0]+".bam")
+    tmpdir = P.get_temp_dir()
 
     statement = '''rMATS
     --b1 %(outdir)s/b1.txt
@@ -107,6 +110,7 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
     --readLength %(readlength)s
     --cstat %(pvalue)s
     --libType %(strand)s
+    --tmp %(tmpdir)s
     ''' % locals()
 
     # if Paired End Reads
@@ -117,10 +121,10 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
     > %(outdir)s/%(designfile)s.log
     '''
 
-    P.run(statement, job_condaenv="splicing")
+    P.run(statement)
 
 
-def rmats2sashimi(infile, designfile, FDR, outfile):
+def rmats2sashimi(infile, designfile, FDR, outfile, plotmax=20):
     '''Module to generate sashimi plots from rMATS output
 
     Module generates a statement to call rmats2sashimiplot and provides
@@ -161,17 +165,25 @@ def rmats2sashimi(infile, designfile, FDR, outfile):
     event = os.path.basename(os.path.normpath(outfile))
     if "MXE" in infile:
         column = "22"
+        sortby = "25"
     else:
         column = "20"
+        sortby = "23"
+
+    temp = pd.read_csv(infile, sep='\t')
+    temp = temp.dropna()
+    temp.sort_values(by=['FDR'], inplace=True)
+    temp = temp[abs(temp['IncLevelDifference']) > 0.2]
+    plotnum = min(int(len(temp[temp['FDR'] < float(FDR)])), plotmax)
+    infile = P.snip(infile)
+    temp.iloc[1:plotnum,:].to_csv("%s_plot.txt" % infile, sep='\t', index=False)
 
-    statement = '''cat
-    %(infile)s|grep -v NA|
-    awk '$%(column)s < %(FDR)s' > %(infile)s_sig.txt;
+    statement = '''
     rmats2sashimiplot
     --b1 %(group1)s
     --b2 %(group2)s
     -t %(event)s
-    -e %(infile)s_sig.txt
+    -e %(infile)s_plot.txt
     --l1 %(group1name)s
     --l2 %(group2name)s
     -o %(outfile)s

diff --git a/cgatpipelines/tools/pipeline_bamstats.py b/cgatpipelines/tools/pipeline_bamstats.py
@@ -244,6 +244,48 @@ def countReads(infile, outfile):
 
     P.run(statement)
 
+
+@transform(PARAMS["annotations_interface_geneset_all_gtf"],
+           regex("(\S+).gtf.gz"),
+           "rRNA_intervals.tsv")
+def buildRrnaIntervals(infile, outfile):
+    '''
+    Builds a reference list of rRNA intervals for Picard
+
+    Parameters
+    ----------
+    infile: str
+        path to the GTF file containing transcript and gene level annotations
+    genome_dir: str
+        :term: `PARAMS` the directory of the reference genome
+    genome: str
+        :term: `PARAMS` the filename of the reference genome (without .fa)
+    outfile: str
+        path to output file
+    '''
+
+    genome_index = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa.fai"))
+    tempfile = P.get_temp_filename()
+
+    statement = r'''
+    cut -f1,2 %(genome_index)s |
+    awk -F, '{$1="@SQ\tSN:" $1;}1' OFS='\t'|
+    awk 'BEGIN{OFS="\t"}$3="LN:"$3' 
+    > %(outfile)s;
+
+    zcat %(infile)s | grep 'gene_biotype "rRNA"' |
+        awk '$3 == "exon"'|cgat gtf2tsv --attributes-as-columns|
+        grep -v '#'| cut -f1,4,5,7,10|sed '1d'|
+        sort -k1V -k2n -k3n 
+        >> %(outfile)s
+    ''' % locals()
+
+    P.run(statement)
+
+
+
+
 #########################################################################
 # QC tasks start here
 #########################################################################
@@ -626,7 +668,8 @@ def buildTranscriptProfiles(infiles, outfile):
 @P.add_doc(bamstats.buildPicardRnaSeqMetrics)
 @transform(intBam,
            regex("BamFiles.dir/(.*).bam$"),
-           add_inputs(PARAMS["annotations_interface_ref_flat"]),
+           add_inputs(PARAMS["annotations_interface_ref_flat"],
+                      buildRrnaIntervals),
            r"Picard_stats.dir/\1.picard_rna_metrics")
 def buildPicardRnaSeqMetrics(infiles, outfile):
     '''Get duplicate stats from picard RNASeqMetrics '''
@@ -637,6 +680,8 @@ def buildPicardRnaSeqMetrics(infiles, outfile):
         strand = "FIRST_READ_TRANSCRIPTION_STRAND"
     else:
         strand = "NONE"
+
+
     bamstats.buildPicardRnaSeqMetrics(infiles, strand, outfile,
                                       PICARD_MEMORY)
 

diff --git a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py
@@ -1039,7 +1039,6 @@ def filterDESeq2(infiles, outfile, design_name, quantifier_name):
 
     counts, design = infiles
     transcripts, genes = counts
-    design_name = design_name.lower()
     quantifier_name = quantifier_name.lower()
     counts = "--counts-dir %s.dir" % quantifier_name     
     model = PARAMS.get('deseq2_model%s' % design_name, None)
@@ -1081,7 +1080,6 @@ def filterEdgeR(infiles, outfile, design_name, quantifier_name):
 
     counts, design = infiles
     transcripts, genes = counts
-    design_name = design_name.lower()
     quantifier_name = quantifier_name.lower()
     counts = "--counts-dir %s.dir" % quantifier_name     
     model = PARAMS.get('edger_model%s' % design_name, None)
@@ -1123,7 +1121,6 @@ def filterEdgeR(infiles, outfile, design_name, quantifier_name):
 def exploratoryAnalysis(infile, outfile, design_name, quantifier_name, detool_name):
 
 
-    design_name = design_name.lower()
     design = "design" + design_name + ".tsv"
     outdir = os.path.dirname(outfile)
     r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))
@@ -1166,7 +1163,6 @@ def exploratoryAnalysis(infile, outfile, design_name, quantifier_name, detool_na
 def runDESeq2(infile, outfile, design_name, quantifier_name):
     ''' run DESeq2 to identify differentially expression transcripts/genes'''
 
-    design_name = design_name.lower()
     design = "design" + design_name + ".tsv"
     outdir = os.path.dirname(outfile)
     r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))
@@ -1216,7 +1212,6 @@ def runDESeq2(infile, outfile, design_name, quantifier_name):
 def runEdgeR(infile, outfile, design_name, quantifier_name):
     ''' run EdgeR to identify differentially expression transcripts/genes'''
 
-    design_name = design_name.lower()
     design = "design" + design_name + ".tsv"
     outdir = os.path.dirname(outfile)
     r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))