-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_rnaseq_cellines_CMP.r
409 lines (295 loc) · 16.2 KB
/
process_rnaseq_cellines_CMP.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
########################################################################
#
# process_rnaseq_cellines
# process RNAseq data from Cell Lines. From raw data to voom and ComBat
# batch corrected.
#
# Copyright (C) 2019 saezlab, Uniklinikum Heidelberg
# Author: Rosa Hernansaiz-Ballesteros
# addapted verion of process_rnaseq_cellines.r for CMP releases
# (https://github.com/saezlab/PathActivities/blob/master/src/transcriptomics/rnaseq_cell_lines/process_rnaseq_cellines.r)
#
# process_rnaseq_cellines_CMP.r
# Pipeline to process RNAseq data from Cell Model Pasports releases
# (https://cellmodelpassports.sanger.ac.uk/)
# From the raw data, the pipeline:
# - Imputes 0 to NA values
# - Removes samples exeding genes with 0
# - Removes non-expressed genes with average CPM <=0
# - Normalises through TMM method
# - Runs Voom transformation
# - Uses ComBat for Batch correction of voom transformed data
# - Merges duplicates by the mean
#
# process_rnaseq_cellines is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# process_rnaseq_cellines is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CARNIVAL1000. If not, see <https://www.gnu.org/licenses/>.
#
########################################################################
rm(list=ls());cat("\014");if(length(dev.list())>0){dev.off()};options(warn=-1)
library(data.table)
library(dplyr)
library(stringr)
library(limma)
library(sva)
library(edgeR)
library(ggplot2)
library(ggforce)
library(ggfortify)
###### set paths and variables ######
wd = '~/Projects/process_rnaseq_cellines/CMP' #working directory
setwd(wd)
#metasample = '../../00-CMP/metadata/model_list_2019-04-04_0916.csv' #vector containing the path to the metainformation about the cell lines
metasample = '../../00-CMP/metadata/model_list_2019-06-21_1535.csv'
rnadata = '../../00-CMP/expression/rnaseq_2019-04-15_1133.csv'#vector containing the path to the RNAseq file of cell lines
###### Loading datasets ######
cat('Load expression data \n')
cmpRAW <- fread(rnadata, sep = ',')
cat('Load clinics data \n')
metaSMP <- read.delim(metasample, sep = ',')
##### Matrix extraction #####
message('Create matrix Gene x Sample \n')
models <- unique(cmpRAW$model_id) #unique samples
## as the original dataset from Cell Model Pasports contains samples from 2 different sources (Sanger and Broad),
## the samples are separarated based on the source to run the pipeline, as they have different number of genes:
pb <- txtProgressBar(min = 0, max = length(models), style = 3) # create progress bar
for (i in 1:length(models)) { #for each of the unique samples
setTxtProgressBar(pb, i) #update progress bar
model <- models[i]
df <- cmpRAW[which(cmpRAW$model_id==model),c('gene_id','read_count', 'dataset_name')] #get the relevant information
dset <- unique(df$dataset_name)
for(institute in dset){
dfsub <- df[which(df$dataset_name==institute),c('gene_id', 'read_count')]
colnames(dfsub) <- c('gene_id', model)
if(is.null(EXP_rawcounts[[institute]])){
EXP_rawcounts[[institute]] <- dfsub
}else{
EXP_rawcounts[[institute]] <- merge(EXP_rawcounts[[institute]],dfsub, by = 'gene_id')
}
}
}
close(pb)
rm(pb, df, dfsub, i, dset, institute, model)
# transforming data.table to data.frame
names(EXP_rawcounts) <- gsub("Sanger RNASeq", 'sanger', names(EXP_rawcounts))
names(EXP_rawcounts) <- gsub("Broad \\(CCLE\\) RNASeq", 'broad', names(EXP_rawcounts))
EXP_rawcounts[['sanger']] <- setDF(EXP_rawcounts[['sanger']], rownames=EXP_rawcounts[['sanger']]$gene_id)
EXP_rawcounts[['sanger']] <- EXP_rawcounts[['sanger']][,-1]
EXP_rawcounts[['broad']] <- setDF(EXP_rawcounts[['broad']], rownames=EXP_rawcounts[['broad']]$gene_id)
EXP_rawcounts[['broad']] <- EXP_rawcounts[['broad']][,-1]
saveRDS(EXP_rawcounts, file = '1907_release1904/rnaseq_2019-06-26_geneVSmodel_list.rds')
# EXP_rawcounts <- readRDS('1907_release1904/rnaseq_2019-06-26_geneVSmodel_list.rds')
print(lapply(EXP_rawcounts, dim))
# $sanger
# [1] 35004 447
#
# $broad
# [1] 37260 706
#### Exploration of data
gg.venn <- data.frame(x = c(-0.866, 0.866),
y = c(0, 0),
labels = c('Sanger', 'Broad'))
gg.data.samples <- list(shared = intersect(colnames(EXP_rawcounts$sanger), colnames(EXP_rawcounts$broad)),
sanger = as.character(colnames(EXP_rawcounts$sanger) [ which( !colnames(EXP_rawcounts$sanger)%in%colnames(EXP_rawcounts$broad) ) ]),
broad = as.character(colnames(EXP_rawcounts$broad) [ which( !colnames(EXP_rawcounts$broad)%in%colnames(EXP_rawcounts$sanger) ) ]) )
gg.sample <- as.data.frame(matrix(data=NA, nrow=3, ncol = 4, dimnames = list(NULL, c('label', 'x', 'y', 'count'))), stringsAsFactors = F)
gg.sample$label <- c('shared', 'Sanger', 'Broad')
gg.sample$x <- c(0, -1.5, 1.5)
gg.sample$y <- c(0, 0, 0)
gg.sample$count <- c(length(gg.data.samples$shared), length(gg.data.samples$sanger), length(gg.data.samples$broad))
gg.data.genes <- list(shared = intersect(rownames(EXP_rawcounts$sanger), rownames(EXP_rawcounts$broad)),
sanger = as.character(rownames(EXP_rawcounts$sanger) [ which( !rownames(EXP_rawcounts$sanger)%in%rownames(EXP_rawcounts$broad) ) ]),
broad = as.character(rownames(EXP_rawcounts$broad) [ which( !rownames(EXP_rawcounts$broad)%in%rownames(EXP_rawcounts$sanger) ) ]) )
gg.genes <- as.data.frame(matrix(data=NA, nrow=3, ncol = 4, dimnames = list(NULL, c('label', 'x', 'y', 'count'))), stringsAsFactors = F)
gg.genes$label <- c('shared', 'Sanger', 'Broad')
gg.genes$x <- c(0, -1.5, 1.5)
gg.genes$y <- c(0, 0, 0)
gg.genes$count <- c(length(gg.data.genes$shared), length(gg.data.genes$sanger), length(gg.data.genes$broad))
png('xplr/ven_samples_SB.png', height = 600, width = 600, res = 150)
ggplot(gg.venn, aes(x0 = x, y0 = y, r = 1.5, fill = labels)) +
geom_circle(alpha = .3, size = 1, colour = 'grey') +
coord_fixed() +
labs(fill = NULL) +
theme_void() + theme(legend.position = 'bottom') +
annotate("text", x = gg.sample$x, y = gg.sample$y, label = gg.sample$count , size = 10) +
ggtitle('Samples CMPs (2019-04-15_1133)') +
scale_fill_manual(values = c('#E69F00', '#009E73', '#0072B2'))
dev.off()
png('xplr/ven_samples_SB.png', height = 600, width = 600, res = 150)
ggplot(gg.venn, aes(x0 = x, y0 = y, r = 1.5, fill = labels)) +
geom_circle(alpha = .3, size = 1, colour = 'grey') +
coord_fixed() +
labs(fill = NULL) +
theme_void() + theme(legend.position = 'bottom') +
annotate("text", x = gg.genes$x, y = gg.genes$y, label = gg.genes$count , size = 10) +
ggtitle('Samples CMPs (2019-04-15_1133)') +
scale_fill_manual(values = c('#E69F00', '#009E73', '#0072B2'))
dev.off()
message('Remove samples with unknown tissue\n')
#There are none for this dataset
tissue = unique(metaSMP$tissue)
samples = intersect(unique(c(colnames(EXP_rawcounts[['sanger']]), colnames(EXP_rawcounts[['broad']]))),
metaSMP$model_id[ which(metaSMP$tissue %in% tissue) ])
EXP_rawcounts <- lapply(EXP_rawcounts, function(EXP){EXP[ , colnames(EXP) %in% samples ]})
message('Imputing NA (ZEROmethod) \n')
#Therea are none for this dataset
exp_na <- lapply(EXP_rawcounts, function(EXP){# save gene:sample pairs that are NAs
index <- which(is.na(EXP), arr.ind=T)
paste(rownames(EXP)[index[,'row']], colnames(EXP)[index[,'col']], sep=':')
})
if ( !is.null(do.call(c,lapply(exp_na,dim))) ){
EXP_rawcounts <- lapply(EXP_rawcounts, function(EXP){EXP[ which(is.na(EXP)) ] = 0})
}
cat('Total genes and cell lines: \n')
print(lapply(EXP_rawcounts, dim))
#### Exploratory analysis of the raw counts distribution and the sequencing depth ####
# Raw counts distribution
png('xplr/rnaseq_2019-06-26_rawcounts_distribution.png', height = 800, width = 900, res = 150)
par(mfrow = c(1,2))
hist(log2(as.numeric(unlist(EXP_rawcounts[['sanger']]))), breaks = 100, main = 'Cell lines raw counts Sanger', xlab = 'log2(raw counts)')
hist(log2(as.numeric(unlist(EXP_rawcounts[['broad']]))), breaks = 100, main = 'Cell lines raw counts Broad', xlab = 'log2(raw counts)')
dev.off()
# Sequencing Depth
png('xplr/rnaseq_2019-06-26_sequncing_depth.png', height = 800, width = 900, res = 150)
par(mfrow = c(1,2))
barplot(colSums(EXP_rawcounts$sanger), main = 'Cell lines raw counts Sanger', xlab = 'Sum raw counts per Sample')
barplot(colSums(EXP_rawcounts$broad), main = 'Cell lines raw counts Broad', xlab = 'Sum raw counts per Sample')
dev.off()
#### IMPORTANT
### Because the sequencing depth is different, the datasets won't be merge before the normalitation
####
rmSamples = F # Turn it T to remove samples that exeed genes with 0
if (rmSamples){
message('Remove samples exeding genes with 0\n')
zero.count = lapply(EXP_rawcounts,function(EXP){
apply(EXP, 2, function(x) sum(x==0) ) / nrow(EXP)
})
print(lapply(zero.count, summary))
zeroThreshold = NULL#Threshold to remove samples. Original script used .4
EXP_rawcounts[['sanger']] = EXP_rawcounts[['sanger']][ , ! colnames(EXP_rawcounts[['sanger']]) %in% names(zero.count[['sanger']])[ zero.count[['sanger']] > zeroThreshold ] ]
EXP_rawcounts[['broad']] = EXP_rawcounts[['broad']][ , ! colnames(EXP_rawcounts[['broad']]) %in% names(zero.count[['broad']])[ zero.count[['broad']] > zeroThreshold ] ]
}
message('Remove non-expressed genes with average CPM <= 0 ')
# "The limma-voom method assumes that rows with zero or very low counts have been removed"
exp.zero <- lapply(EXP_rawcounts, function(EXP){
index = aveLogCPM(EXP) <= 0
rownames(EXP)[index]
})
saveRDS(exp.zero, file = '1907_release1904/rnaseq_2019-06-26_expZero_list.rds')
# exp.zero <- readRDS(file = '1907_release1904/rnaseq_2019-06-26_expZero_list.rds')
EXP_rawcounts <- lapply(EXP_rawcounts, function(EXP){
keep = aveLogCPM(EXP) > 0
EXP [ keep, ]
})
cat('Total genes and cell lines: \n')
print(lapply(EXP_rawcounts, dim))
saveRDS(EXP_rawcounts, file = '1907_release1904/rnaseq_2019-06-26_rmZeroGenes_list.rds')
# EXP_rawcounts <- readRDS(file = '1907_release1904/rnaseq_2019-06-26_rmZeroGenes_list.rds')
message('Normalization\n')
# Although it is also possible to give a matrix of counts directly to voom without TMM normalization,
# limma package reommends it
EXP_rawcounts = lapply(EXP_rawcounts, function(EXP){DGEList(counts=EXP,genes=rownames(EXP))})
EXP_normcounts = lapply(EXP_rawcounts, function(EXP){calcNormFactors(EXP, method = 'TMM')})
saveRDS(EXP_normcounts, file = '1907_release1904/rnaseq_2019-06-26_tmm_norm_list.rds')
# EXP_normcounts <- readRDS(file = 'rnaseq_2019-06-26_tmm_norm_list.rds')
# "method="TMM" is the weighted trimmed mean of M-values (to the reference) proposed by Robinson
# and Oshlack (2010), where the weights are from the delta method on Binomial data. If refColumn
# is unspecified, the library whose upper quartile is closest to the mean upper quartile is used.
# method="RLE" is the scaling factor method proposed by Anders and Huber (2010). We call it
# "relative log expression", as median library is calculated from the geometric mean of all columns
# and the median ratio of each sample to the median library is taken as the scale factor.
# method="upperquartile" is the upper-quartile normalization method of Bullard et al (2010), in
# which the scale factors are calculated from the 75% quantile of the counts for each library, after
# removing genes which are zero"
message('Voom transformation\n')
EXP_voom = lapply(EXP_normcounts, function(EXP){voom(EXP, plot = T)$E}) # $E extract the "data corrected for variance"
saveRDS(EXP_voom, file = '1907_release1904/rnaseq_2019-06-26_voom_list.rds')
# EXP_voom <- readRDS(file = '1907_release1904/rnaseq_2019-06-26_voom_list.rds')
message('Building merged matrix \n')
genes = Reduce(intersect, list(rownames(EXP_voom$sanger), rownames(EXP_voom$broad)))
tmp <- lapply(EXP_voom, function(EXP, genes){
EXP <- as.data.frame(cbind(genes=rownames(EXP), EXP),stringsAsFactors=F)
EXP[ genes, ]},genes)
cat('Total genes and cell lines: \n')
print(lapply(tmp, dim))
colnames(tmp$sanger) = paste0('S.',colnames(tmp$sanger))
colnames(tmp$broad) = paste0('B.',colnames(tmp$broad))
EXP <- merge(tmp$sanger, tmp$broad, by.x = 'S.genes', by.y = 'B.genes', row)
rownames(EXP) <- EXP$S.genes
EXP <- EXP[,2:ncol(EXP)]
EXP <- data.matrix(EXP, rownames.force = T)
cat('Total genes and cell lines:', dim(EXP), '\n')
#### Exploratory analysis through PCA ####
## variables
catgroup <- as.data.frame(cbind(id=colnames(EXP),sample=gsub('[S/B]\\.','',colnames(EXP)),dataset=substr(colnames(EXP),1,1)),stringsAsFactors = F)
catgroup <- merge(catgroup, metaSMP[,c("model_id","tissue","tissue_status", "cancer_type")], by.x='sample', by.y='model_id')
catgroup <- catgroup[,-1]
# catgroup$suppliers <- unlist(lapply(strsplit(as.character(catgroup$suppliers), split=':'),function(X){X[1]}))
### sequencing Depth
# sequencingDepth <- do.call(c,lapply(EXP_rawcounts, colSums))
# names(sequencingDepth) <- gsub('sanger','S',names(sequencingDepth))
# names(sequencingDepth) <- gsub('broad','B',names(sequencingDepth))
#
# catgroup <- merge(catgroup, as.data.frame(cbind(id=names(sequencingDepth),sDepth=sequencingDepth),stringsAsFactors=F), by='id')
## PCA
exp.pca <- prcomp(t(EXP))
str(exp.pca)
exp.pca.variances <- ((exp.pca$sdev^2) / (sum(exp.pca$sdev^2)))*100
png('xplr/rnaseq_2019-06-26_PCA_barplot.png', height = 800, width = 900, res = 150)
barplot(exp.pca.variances, cex.names=1, xlab=paste("Principal component (PC), 1-", length(exp.pca$sdev)),
ylab="Proportion of variation (%)", main="Scree plot", ylim=c(0,20))
dev.off()
png('xplr/rnaseq_2019-06-26_PCA.png', height = 800, width = 900, res = 150)
qplot(exp.pca$x[,1], exp.pca$x[,2], data=catgroup,
color= cancer_type, geom = c('point'), label = dataset,#,'text'
xlab=paste("PC1, ", round(exp.pca.variances[1], 2), "%"),
ylab=paste("PC2, ", round(exp.pca.variances[2], 2), "%"))
dev.off()
##
message('Batch correction of voom transfromed data using ComBat\n')
cat('Using "tisue" and duplicates as covariates\n')
batch = substr(colnames(EXP),1,1)
cov_tissue = as.character(metaSMP$tissue[ match( gsub('[S/B]\\.','',colnames(EXP)), metaSMP$model_id) ])
duplicates = gsub('[S/B]\\.','',colnames(EXP)[duplicated( gsub('[S/B]\\.','',colnames(EXP)) )])
cov_duplicates = sapply(duplicates, function(id) (metaSMP$model_id[ match(gsub('[S/B]\\.','',colnames(EXP)), metaSMP$model_id) ] == id) + 0)
cov_duplicates[ is.na(cov_duplicates) ] = 0
covariates = model.matrix(~cov_tissue+cov_duplicates)
EXP_corvoom = ComBat(EXP, batch = batch, mod = covariates, par.prior = T, prior.plots = T)
saveRDS(EXP_corvoom, file = '1907_release1904/rnaseq_2019-06-26_voom_batchcor.rds')
# EXP_corvoom <- readRDS(file = 'rnaseq_2019-06-26_voom_batchcor.rds')
## PCA
exp.pca <- prcomp(t(EXP_corvoom))
str(exp.pca)
exp.pca.variances <- ((exp.pca$sdev^2) / (sum(exp.pca$sdev^2)))*100
png('xplr/rnaseq_2019-06-26_PCAcomBat_barplot.png', height = 800, width = 900, res = 150)
barplot(exp.pca.variances, cex.names=1, xlab=paste("Principal component (PC), 1-", length(exp.pca$sdev)),
ylab="Proportion of variation (%)", main="Scree plot", ylim=c(0,20))
dev.off()
png('xplr/rnaseq_2019-06-26_PCAcomBat.png', height = 800, width = 900, res = 150)
qplot(exp.pca$x[,1], exp.pca$x[,2], data=catgroup,
color= dataset, geom = c('point'), label = dataset,#,'text'
xlab=paste("PC1, ", round(exp.pca.variances[1], 2), "%"),
ylab=paste("PC2, ", round(exp.pca.variances[2], 2), "%"))
dev.off()
##
message('Merging duplicates with the mean')
samples = unique( gsub('[S/B]\\.','',colnames(EXP)) )
genes = rownames(EXP_corvoom)
X = matrix(NA, nrow = length(genes), ncol = length(samples), dimnames = list(genes, samples))
for( s in samples){
x = EXP_corvoom[ , grep(s, colnames(EXP_corvoom)) ]
if ( is.matrix(x) )
x = apply(x, 1, mean)
X[, s] = x
}
EXPmerged = X
saveRDS(EXPmerged, file = '1907_release1904/rnaseq_2019-06-26_voom_batchcor_merged.rds')