Skip to content

Commit

Permalink
Merge pull request #154 from cgat-developers/JS-current
Browse files Browse the repository at this point in the history
Improvements to splicing, bamstats and rnaseq
  • Loading branch information
jscaber authored Nov 24, 2021
2 parents deed012 + e9689ee commit 87fcdbc
Show file tree
Hide file tree
Showing 9 changed files with 224 additions and 44 deletions.
65 changes: 60 additions & 5 deletions cgatpipelines/Rtools/diffexonexpression.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#'
#' -> todo: ideas

## conda dependencies: r-cairo


suppressMessages(library(futile.logger))
suppressMessages(library(getopt))
Expand Down Expand Up @@ -84,7 +84,8 @@ run <- function(opt) {
flog.info("Running DEXSeq")
dxd = estimateSizeFactors(experiment)
dxd = estimateDispersions(dxd)
dxd = testForDEU(dxd)
dxd = testForDEU(dxd, reducedModel = formula(opt$reducedmodel),
fullModel = formula(opt$model))
dxd = estimateExonFoldChanges( dxd, fitExpToVar=opt$contrast)
res = DEXSeqResults(dxd)

Expand Down Expand Up @@ -141,6 +142,53 @@ run <- function(opt) {
plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast)
end_plot()
}

if(opt$perm >0){
flog.info("... performing permutations")
designperm <- colData(dxd)
dxdperm = dxd
x = 0
i = 1
y= list()
while(i <= opt$perm) {
flog.info(paste0("...... Permutation ", i))
designperm[, opt$contrast] = sample(designperm[, opt$contrast])
if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dxd)[,opt$contrast])[1]){
flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
next
}
colData(dxdperm) <- designperm
dxdperm = testForDEU(dxdperm)
dxdperm = estimateExonFoldChanges( dxdperm, fitExpToVar=opt$contrast)
resperm = DEXSeqResults(dxdperm)
x[i] = length(subset(resperm, padj < opt$alpha)$padj)
y[[i]] = dxdperm$group
i = i+1
}
flog.info(paste0("Result: " , x))
start_plot("Simulations", outdir=opt$outdir)
theme_set(theme_gray(base_size = 18))
sims = qplot(x,
geom="histogram",
breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"),
main = "Histogram of DE experiments\n with random group labels",
xlab = "Number of differentially expressed exons",
ylab = "Number of simulations") +
geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
print(sims)
end_plot()
flog.info(paste0("... Permutation p value: ",
length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
z <- list()
for(i in 0:length(x)){
z[i] <- paste( unlist(y[i]), collapse=' ')
}
z <- unlist(z)
df <- data.frame(number=x, combination=z)
df
write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv"))
}
}

main <- function() {
Expand All @@ -157,7 +205,14 @@ main <- function() {
"--model",
dest = "model",
type = "character",
default = "~ group",
default = "~ sample + exon + group:exon",
help = paste("model formula for GLM")
),
make_option(
"--reducedmodel",
dest = "reducedmodel",
type = "character",
default = "~ sample + exon",
help = paste("model formula for GLM")
),
make_option(
Expand Down Expand Up @@ -191,8 +246,8 @@ main <- function() {
make_option(
"--permute",
dest = "perm",
type = "logical",
default = FALSE,
type = "integer",
default = 0,
help = paste("number of permutations")
),
make_option(
Expand Down
21 changes: 14 additions & 7 deletions cgatpipelines/Rtools/diffexpression.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#' filtering single cell data based on QC metrics
#' Differential expression analysis
#'
#' WARNING: This script is work-in-progress
#'
#' Example usage:
#'
#' cgat-singlecell sc_diffexpression --rds-filename=sce.rds --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
#' Rscript PATH/TO/diffexpression.R --rds-filename=sce.rds --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
#'
#' `sce.rds` is a single cell experiment object after filtering
#'
Expand Down Expand Up @@ -261,12 +261,19 @@ run <- function(opt) {
i = 1
y= list()
while(i <= opt$perm) {
designperm[, opt$contrast] = as.factor(sample(levels(designperm[, opt$contrast]), length(designperm[, opt$contrast]), replace=TRUE))
if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]) next
flog.info(paste0("...... Permutation ", i))
# Code to only shuffle the group labels for groups in coef
tempdesign <- designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),]
designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),][,opt$contrast] = sample(tempdesign[,opt$contrast])
if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]){
flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
flog.info(paste0("......... Number of reference replicates is: ", sum(designperm$group == levels(designperm[, opt$contrast])[1]), ", but should be: ", table(colData(dds)[,opt$contrast])[1]))
next
}
colData(ddsperm) <- designperm
ddsperm <- DESeq(ddsperm)
resperm <- results(ddsperm)
x[i] = length(subset(resperm, padj < 0.05)$padj)
x[i] = length(subset(resperm, padj < opt$alpha)$padj)
y[[i]] = ddsperm$group
i = i+1
}
Expand All @@ -278,12 +285,12 @@ run <- function(opt) {
main = "Histogram of DE experiments\n with random group labels",
xlab = "Number of differentially expressed genes",
ylab = "Number of simulations") +
geom_vline(xintercept = length(rownames(subset(res, padj < 0.05)))) +
geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
print(sims)
end_plot()
flog.info(paste0("... Permutation p value: ",
length(x[x < length(rownames(subset(res, padj < 0.05)))])/length(x)))
length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
z <- list()
for(i in 0:length(x)){
z[i] <- paste( unlist(y[i]), collapse=' ')
Expand Down
2 changes: 1 addition & 1 deletion cgatpipelines/Rtools/filtercounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#'
#' Example usage:
#'
#' cgatflow R readandfiltercounts
#' Rscript PATH/TO/filtercounts.R
#'
#' input: directory containing read count files or tsv file containing reads
#' additional input variables: method used to generate file, model
Expand Down
3 changes: 2 additions & 1 deletion cgatpipelines/tasks/bamstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,7 +1293,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
job_threads = 3
infile, genome = infiles
infile, genome, rRNA_intervals = infiles

if BamTools.getNumReads(infile) == 0:
E.warn("no reads in %s - no metrics" % infile)
Expand All @@ -1307,5 +1307,6 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
OUTPUT=%(outfile)s
STRAND=%(strand)s
VALIDATION_STRINGENCY=SILENT
RIBOSOMAL_INTERVALS=%(rRNA_intervals)s
'''
P.run(statement)
24 changes: 18 additions & 6 deletions cgatpipelines/tasks/splicing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class variables. Instead use an option string that can be set in

import os
import random
import shutil
import pandas as pd
import cgat.BamTools.bamtools as BamTools
import cgatcore.experiment as E
import cgatpipelines.tasks.expression as Expression
Expand Down Expand Up @@ -98,6 +100,7 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
with open(outdir + "/b2.txt", "w") as f:
f.write(group2)
readlength = BamTools.estimateTagSize(design.samples[0]+".bam")
tmpdir = P.get_temp_dir()

statement = '''rMATS
--b1 %(outdir)s/b1.txt
Expand All @@ -107,6 +110,7 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
--readLength %(readlength)s
--cstat %(pvalue)s
--libType %(strand)s
--tmp %(tmpdir)s
''' % locals()

# if Paired End Reads
Expand All @@ -117,10 +121,10 @@ def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
> %(outdir)s/%(designfile)s.log
'''

P.run(statement, job_condaenv="splicing")
P.run(statement)


def rmats2sashimi(infile, designfile, FDR, outfile):
def rmats2sashimi(infile, designfile, FDR, outfile, plotmax=20):
'''Module to generate sashimi plots from rMATS output
Module generates a statement to call rmats2sashimiplot and provides
Expand Down Expand Up @@ -161,17 +165,25 @@ def rmats2sashimi(infile, designfile, FDR, outfile):
event = os.path.basename(os.path.normpath(outfile))
if "MXE" in infile:
column = "22"
sortby = "25"
else:
column = "20"
sortby = "23"

temp = pd.read_csv(infile, sep='\t')
temp = temp.dropna()
temp.sort_values(by=['FDR'], inplace=True)
temp = temp[abs(temp['IncLevelDifference']) > 0.2]
plotnum = min(int(len(temp[temp['FDR'] < float(FDR)])), plotmax)
infile = P.snip(infile)
temp.iloc[1:plotnum,:].to_csv("%s_plot.txt" % infile, sep='\t', index=False)

statement = '''cat
%(infile)s|grep -v NA|
awk '$%(column)s < %(FDR)s' > %(infile)s_sig.txt;
statement = '''
rmats2sashimiplot
--b1 %(group1)s
--b2 %(group2)s
-t %(event)s
-e %(infile)s_sig.txt
-e %(infile)s_plot.txt
--l1 %(group1name)s
--l2 %(group2name)s
-o %(outfile)s
Expand Down
47 changes: 46 additions & 1 deletion cgatpipelines/tools/pipeline_bamstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,48 @@ def countReads(infile, outfile):

P.run(statement)


@transform(PARAMS["annotations_interface_geneset_all_gtf"],
regex("(\S+).gtf.gz"),
"rRNA_intervals.tsv")
def buildRrnaIntervals(infile, outfile):
'''
Builds a reference list of rRNA intervals for Picard
Parameters
----------
infile: str
path to the GTF file containing transcript and gene level annotations
genome_dir: str
:term: `PARAMS` the directory of the reference genome
genome: str
:term: `PARAMS` the filename of the reference genome (without .fa)
outfile: str
path to output file
'''

genome_index = os.path.abspath(
os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa.fai"))
tempfile = P.get_temp_filename()

statement = r'''
cut -f1,2 %(genome_index)s |
awk -F, '{$1="@SQ\tSN:" $1;}1' OFS='\t'|
awk 'BEGIN{OFS="\t"}$3="LN:"$3'
> %(outfile)s;
zcat %(infile)s | grep 'gene_biotype "rRNA"' |
awk '$3 == "exon"'|cgat gtf2tsv --attributes-as-columns|
grep -v '#'| cut -f1,4,5,7,10|sed '1d'|
sort -k1V -k2n -k3n
>> %(outfile)s
''' % locals()

P.run(statement)




#########################################################################
# QC tasks start here
#########################################################################
Expand Down Expand Up @@ -626,7 +668,8 @@ def buildTranscriptProfiles(infiles, outfile):
@P.add_doc(bamstats.buildPicardRnaSeqMetrics)
@transform(intBam,
regex("BamFiles.dir/(.*).bam$"),
add_inputs(PARAMS["annotations_interface_ref_flat"]),
add_inputs(PARAMS["annotations_interface_ref_flat"],
buildRrnaIntervals),
r"Picard_stats.dir/\1.picard_rna_metrics")
def buildPicardRnaSeqMetrics(infiles, outfile):
'''Get duplicate stats from picard RNASeqMetrics '''
Expand All @@ -637,6 +680,8 @@ def buildPicardRnaSeqMetrics(infiles, outfile):
strand = "FIRST_READ_TRANSCRIPTION_STRAND"
else:
strand = "NONE"


bamstats.buildPicardRnaSeqMetrics(infiles, strand, outfile,
PICARD_MEMORY)

Expand Down
5 changes: 0 additions & 5 deletions cgatpipelines/tools/pipeline_rnaseqdiffexpression.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,7 +1039,6 @@ def filterDESeq2(infiles, outfile, design_name, quantifier_name):

counts, design = infiles
transcripts, genes = counts
design_name = design_name.lower()
quantifier_name = quantifier_name.lower()
counts = "--counts-dir %s.dir" % quantifier_name
model = PARAMS.get('deseq2_model%s' % design_name, None)
Expand Down Expand Up @@ -1081,7 +1080,6 @@ def filterEdgeR(infiles, outfile, design_name, quantifier_name):

counts, design = infiles
transcripts, genes = counts
design_name = design_name.lower()
quantifier_name = quantifier_name.lower()
counts = "--counts-dir %s.dir" % quantifier_name
model = PARAMS.get('edger_model%s' % design_name, None)
Expand Down Expand Up @@ -1123,7 +1121,6 @@ def filterEdgeR(infiles, outfile, design_name, quantifier_name):
def exploratoryAnalysis(infile, outfile, design_name, quantifier_name, detool_name):


design_name = design_name.lower()
design = "design" + design_name + ".tsv"
outdir = os.path.dirname(outfile)
r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))
Expand Down Expand Up @@ -1166,7 +1163,6 @@ def exploratoryAnalysis(infile, outfile, design_name, quantifier_name, detool_na
def runDESeq2(infile, outfile, design_name, quantifier_name):
''' run DESeq2 to identify differentially expression transcripts/genes'''

design_name = design_name.lower()
design = "design" + design_name + ".tsv"
outdir = os.path.dirname(outfile)
r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))
Expand Down Expand Up @@ -1216,7 +1212,6 @@ def runDESeq2(infile, outfile, design_name, quantifier_name):
def runEdgeR(infile, outfile, design_name, quantifier_name):
''' run EdgeR to identify differentially expression transcripts/genes'''

design_name = design_name.lower()
design = "design" + design_name + ".tsv"
outdir = os.path.dirname(outfile)
r_root = os.path.abspath(os.path.dirname(cgatpipelines.__file__))
Expand Down
Loading

0 comments on commit 87fcdbc

Please sign in to comment.