-
Notifications
You must be signed in to change notification settings - Fork 1
/
deseq2.R
executable file
·263 lines (227 loc) · 12.3 KB
/
deseq2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("optparse"))
## parse command line arguments
option_list <- list(
make_option(c("-i", "--countFile"), help="input read count file (format: <id> <int> <int> <int> (..<int>..) | can be stdin)"),
make_option(c("-o", "--outDir"), default=".", help="output directory to keep results (default: %default)"),
make_option(c("-p", "--pvalue"), default="0.05", help="p-value (default: %default)"),
make_option(c("-n", "--normalized"), help="input file contains normalized read couts", action="store_true"),
make_option(c("-t", "--treatment"), help="A comma seperated list about description of the data, example: WT,WT,WT,KO,KO,KO (must be equal to the number of samples for which read count is measured in read count file)"),
make_option(c("-c", "--condition"), help="A comma seperated list about additonal description of the data, example: T1,T2,T2,T1,T2,T2 (must be equal to the number of samples for which read count is measured in read count file)"),
make_option(c("-x", "--noheader"), action="store_true", help="input file do not have header"),
make_option(c("-N", "--outFileName"), help="identifier for output files")
)
parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
opt <- parse_args(parser)
## check, if all required arguments are given
if(is.null(opt$countFile) | is.null(opt$treatment)) {
cat("\nProgram: deseq2.R (R script to identify differential expression using DESeq2)\n")
cat("Author: BRIC, University of Copenhagen, Denmark\n")
cat("Version: 1.0\n")
cat("Contact: pundhir@binf.ku.dk\n");
print_help(parser)
q()
}
## load libraries
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(session))
suppressPackageStartupMessages(library(ggplot2))
## create output directory, if does not exist
dir.create(file.path(opt$outDir), showWarnings = FALSE)
## start analysis
## next three lines to debug
#setwd("/Users/sachinpundhir/Lobster/project/rna-seq-analysis/matilda/result_CFUE_2Rep/06_deseq")
#countTable <- read.table("ensembl_transcripts_raw_count", header=TRUE, row.names=1)
#treatment <-unlist(strsplit("WT,WT,WT,KO,KO,KO", ","))
if(identical(opt$countFile, "stdin")==T) {
if(is.null(opt$noheader)) {
countTable <- read.table(file("stdin"), header=T, row.names=1)
} else {
countTable <- read.table(file("stdin"), header=F, row.names=1)
}
#if(!is.numeric(countTable[,2])) {
# t <- apply(countTable, 2, function(x) as.numeric(as.character(x)))
# row.names(t) <- row.names(countTable)[2:nrow(countTable)]
# countTable <- t
#}
} else {
if(is.null(opt$noheader)) {
countTable <- read.table(pipe(paste("grep -v \"^\\_\"", opt$countFile, sep=" ")), header=T, row.names=1)
} else {
countTable <- read.table(pipe(paste("grep -v \"^\\_\"", opt$countFile, sep=" ")), header=F, row.names=1)
}
#if(!is.numeric(countTable[,2])) {
# countTable <- read.table(pipe(paste("grep -v \"^\\_\"", opt$countFile, sep=" ")), header=T, row.names=1)
#}
}
## check number of treatments and conditions, if they are correctly provided
if(ncol(countTable)!=length(unlist(strsplit(opt$treatment, ","))) | (!is.null(opt$condition) & ncol(countTable)!=length(unlist(strsplit(opt$treatment, ","))))) {
cat("\nProgram: deseq2.R (R script to identify differential expression using DESeq2)\n")
cat("Author: BRIC, University of Copenhagen, Denmark\n")
cat("Version: 1.0\n")
cat("Contact: pundhir@binf.ku.dk\n");
print_help(parser)
q()
}
treatment <- toupper(unlist(strsplit(opt$treatment, ",")))
colTable <- as.data.frame(as.factor(treatment), row.names=colnames(countTable))
colnames(colTable) <- "treatment"
if(!is.null(opt$condition)) {
condition <- unlist(strsplit(opt$condition, ","))
colTable$condition <- as.factor(condition)
}
## check if differential expression is for read count which are a) not normalized, b) normalized
if(is.null(opt$normalized)) {
## modify this line to compare more than one set of factors (for example, treatment and condition both)
if(!is.null(opt$condition)) {
dds <- DESeqDataSetFromMatrix(countData = round(countTable, digits=0), colData = colTable, design = ~ condition + treatment)
} else {
dds <- DESeqDataSetFromMatrix(countData = round(countTable, digits=0), colData = colTable, design = ~ treatment)
}
colData(dds)$treatment <- factor(colData(dds)$treatment, levels=c(levels(colTable$treatment)))
dds <- DESeq(dds)
#dds <- DESeq(dds, minReplicatesForReplace=Inf)
} else {
## check, if input file is from a cufflinks output
## if yes, select only deconvoluted transcripts
cuff_dir="./";
if(grepl("/", opt$countFile)) {
cuff_dir <- gsub("/[^/]+$", "", opt$countFile)
}
if(grepl("isoforms", opt$countFile) & file.exists(sprintf("%s/isoforms.count_tracking", cuff_dir))){
cuff_tracking <- read.table(sprintf("%s/isoforms.count_tracking", cuff_dir), header=TRUE)
countTable$WT_status <- cuff_tracking$WT_status
countTable$KO_status <- cuff_tracking$KO_status
countTable <- countTable[(which(countTable$KO_status=="OK" & countTable$WT_status=="OK")),c(1:6)]
}
## modify this line to compare more than one set of factors (for example, treatment and condition both)
if(!is.null(opt$condition)) {
dds <- DESeqDataSetFromMatrix(countData = round(countTable, digits=0), colData = colTable, design = ~ condition + treatment)
} else {
dds <- DESeqDataSetFromMatrix(countData = round(countTable, digits=0), colData = colTable, design = ~ treatment)
}
colData(dds)$treatment <- factor(colData(dds)$treatment, levels=c(levels(colTable$treatment)))
normFactors <- matrix(runif(nrow(dds)*ncol(dds),1,1), ncol=ncol(dds),nrow=nrow(dds))
normalizationFactors(dds) <- normFactors
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
}
## retrieve results
#it makes no difference except that the results() function with no
#arguments will automatically extract results for the last variable in
#the design formula, and for the last level of this variable over the first
#level if this variable is a factor. But otherwise, no it doesn't
#make a difference if you are using the arguments of results() to specify
#which results tables to construct.
## modify this line to compare different set of treatments
#if(is.element("KO", treatment)) {
if(length(grep("ko", treatment, ignore.case = T))>0 & length(grep("wt", treatment, ignore.case = T))>0) {
res <- results(dds, contrast=c("treatment", "KO", "WT"))
} else {
res <- results(dds, contrast=c("treatment", unique(treatment)[2], unique(treatment)[1]))
}
#res <- results(dds, contrast=c("treatment", "t1", "t2"))
#res <- results(dds, contrast=c("treatment", "KO", "WT"), cooksCutoff=FALSE, independentFiltering=FALSE)
## merge results with normalized read counts
size_factors <- sizeFactors(dds)
countTable_norm <- t(apply(countTable, 1, function(x) x/size_factors))
resWithCounts <- merge(res, countTable_norm, by=0, all=TRUE)
resSig <- resWithCounts[which(resWithCounts$padj<as.numeric(opt$pvalue)),]
## define output file name
if(is.null(opt$outFileName)) {
outFile <- unlist(strsplit(opt$countFile, "/"))[length(unlist(strsplit(opt$countFile, "/")))]
} else {
outFile <- opt$outFileName
}
## print results sort by log2foldchange and p-value
## all genes
resWithCounts$class <- "neutral"
resWithCounts[which(resWithCounts$padj<0.05 & resWithCounts$log2FoldChange>0),]$class <- "up"
resWithCounts[which(resWithCounts$padj<0.05 & resWithCounts$log2FoldChange<0),]$class <- "down"
write.table(as.data.frame(resWithCounts[order(-abs(resWithCounts$log2FoldChange), resWithCounts$padj),]), file=sprintf("%s/%s.all.xls", opt$outDir, outFile), quote=F, sep="\t", row.names=F)
## differentially expressed genes
resSig$class <- "neutral"
resSig[which(resSig$padj<0.05 & resSig$log2FoldChange>0),]$class <- "up"
resSig[which(resSig$padj<0.05 & resSig$log2FoldChange<0),]$class <- "down"
write.table(as.data.frame(resSig[order(-abs(resSig$log2FoldChange), resSig$padj),]), file=sprintf("%s/%s.de.xls", opt$outDir, outFile), quote=F, sep="\t", row.names=F)
## kind of log output
#print(mcols(res, use.names=TRUE))
## perform quality check
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("gplots"))
hmcol <- colorRampPalette(brewer.pal(8, "RdBu"))(100)
rld <- rlogTransformation(dds, blind=TRUE)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds), paste(rownames(colData(dds)), sep=":"))
pdf(sprintf("%s/%s.heatmap.pdf", opt$outDir, outFile))
heatmap.2(mat, trace="none", margin=c(13, 13), col=hmcol)
dev.off()
pdf(sprintf("%s/%s.quality.pdf", opt$outDir, outFile))
par(mfrow=c(2,2))
plotMA(dds, ylim=c(-2,2))
plotDispEsts(dds)
hist(res$pvalue, breaks=20, col="grey")
#plot(attr(res, "filterNumRej"), type="b", xlab="quantile of 'baseMean'", ylab="number of rejections")
dev.off()
#suppressPackageStartupMessages(library(limma)) ## plotMA is available in both DESeq2 and limma
pdf(sprintf("%s/%s.pca.pdf", opt$outDir, outFile))
#par(mfrow=c(2,2))
#plotMDS(log2(assays(dds)[["counts"]]))
#plotMDS(log2(assays(dds)[["mu"]]))
if(!is.null(opt$condition)) {
data <- plotPCA(rld, intgroup=c("treatment", "condition"), returnData=T)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=treatment, shape=condition, label=row.names(data))) + geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text(data = data, size=4, color="#000000") + theme_bw()
} else {
data <- plotPCA(rld, intgroup="treatment", returnData=T)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=treatment, label=row.names(data))) + geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text(data = data, size=4, color="#000000") + theme_bw()
}
dev.off()
## plot volcano plot
pdf(sprintf("%s/%s.volcano.pdf", opt$outDir, outFile), height=15, width=15)
df <- as.data.frame(res)
df$gene <- rownames(df)
df$gene <- gsub(".*_[0-9]+_", "", df$gene)
df <- df[,c(7,2,5,6)]
with(df, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
# Add colored points: red if padj<0.05, orange if log2FC>1, green if both)
#pvalue_threshold=as.numeric(opt$pvalue)
#pvalue_threshold=summary(-log10(res[which(res$padj<0.05),]$pvalue))[5]
pvalue_threshold=1
pvalue_threshold=1/exp(pvalue_threshold*log(10))
#with(subset(df, padj<pvalue_threshold ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
#with(subset(df, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="orange"))
with(subset(df, padj<pvalue_threshold), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
# Label points with the textxy function from the calibrate plot
library(calibrate)
with(subset(df, padj<pvalue_threshold), textxy(log2FoldChange, -log10(pvalue), labs=gene, cex=.8))
dev.off()
## print normalized read count of genes
## this was WRONG (https://support.bioconductor.org/p/66067/)
#write.table(as.data.frame(assay(rld)), file=sprintf("%s/%s.normExpr.xls", opt$outDir, outFile), sep="\t", quote=F, col.names=T, row.names=T)
## This is RIGHT (use following instead) (sorted by log2fold change)
write.table(countTable_norm[order(-as.data.frame(res)[,2]),], file=sprintf("%s/%s.normExpr.xls", opt$outDir, outFile), sep="\t", quote=F, col.names=F, row.names=T)
## print size factors for samples
print(size_factors)
## save the session for further use
save.session(sprintf("%s/%s.session", opt$outDir, outFile))
## code for unknown batch effects
#If you have known batches, just include the batch variable in the design for DESeq2.
#We don't recommend testing on transformed counts.
#If you have unknown batches, you can use svaseq or other packages. We are writing up a workflow which will be released in a few weeks and includes svaseq and RUVSeq.
#But briefly, add the SVA surrogate variables (columns of 'sv') to the colData, and then add these to the design. E.g., for two surrogate variables:
#Code:
#dds$SV1 <- svseq$sv[,1]
#dds$SV2 <- svseq$sv[,2]
#design(dds) <- ~ SV1 + SV2 + condition
#dds <- DESeq(dds)
## normalized counts after batch correction
#fitted.common.scale = t( t( assays(dds)[["mu"]] ) / sizeFactors(dds) )