-
Notifications
You must be signed in to change notification settings - Fork 0
/
ArrayExpress.R
198 lines (153 loc) · 7.63 KB
/
ArrayExpress.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
########################
### ArrayExpress ###
########################
# Author: Hannes Reinwald
# Contact: hannes.reinwald@ime.fraunhofer.de
# README # ---------------------------------
# This is a training script to learn the basics of the ArrayExpress R package.
# https://www.bioconductor.org/packages/release/bioc/html/ArrayExpress.html
# ArrayExpress is a large public database for experimental RNAseq data.
# It stores Functional Genomics Data data from high-throughput functional genomics experiments,
# and provides these data for reuse to the research community.
# https://www.ebi.ac.uk/arrayexpress/
# It's entries are linked to the European Nucleotide Archive (ENA) where the raw fastq files are stored
# https://www.ebi.ac.uk/ena/browser/home
# We can use the ArrayExpress package to easily download raw and processed data as well as
# meta information about the samples via their accession number. With the package this can be
# quickly done in R.
##########
### Install ArrayExpress ###
if (!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")}
BiocManager::install("ArrayExpress")
### Load the package ###
require(ArrayExpress)
browseVignettes("ArrayExpress")
### Prepare environment ###
setwd("../") #go one dir up (out of the git repo)
home = getwd()
#############################################
### Download data from ArrayExpress via R ###
#############################################
# In order to download data from ArrayExpress you need to know the accession number of the
# database entries which you would like to download.
# You can search the database online through your browser:
# https://www.ebi.ac.uk/arrayexpress/
# or directly in R with ?queryAE()
# Now we are searching AE for danio rerio with the keyword "thyroid"
sets = queryAE(keywords = "thyroid", species = "danio+rerio")
# Q.1) Which ArrayExpress (AE) entries provide data that was published in a scientific journal?
# Q.2) Which substances were tested in those experiments?
# Q.3) How would you search for entries linked to cancer research in zebrafish? How many entries can you find?
# Try to retrieve cancer related entries for humans. How many can you find there?
### Import an ArrayExpress dataset into R ### ----------------------------
# Once you know which identifier you wish to retrieve, you can use
# ?ArrayExpress() to load the data into R
#rawset = ArrayExpress(accession = "E-MTAB-9056")
# This bugs! Tells me there is no raw data although there is ...
### use getAE to store Arrayexpress datasets locally ### ------------------
# To organize the output a little bit nice I build these functions.
# AEDownload will generate a folder named after the !single! accession number provided and stores
# the files in there. It will further remove any zipped files.
# If import = TRUE the function will generate an R list object named after the accession containing
# the idf, sdrf,, and processed matrix files.
AEDownload <- function(accession, type = "processed", out = getwd(), import = T) {
if(length(accession)>1){stop("length(accession) > 1. Please provide only a single accession number.")}
# create output dir
dir.create(paste0(out,"/ArrayExpress"), showWarnings = F)
setwd(paste0(out,"/ArrayExpress"))
# run getAE() in loop for accession numbers
i = accession
dir.create(i, showWarnings = F)
AE <- getAE(i, type = type, path = i, extract = T)
zip <- list.files(i, full.names = T)[grep(".zip",list.files(i))]
if(file.exists(zip[1])) file.remove(zip)
# Import data in R list object
if(import == T) {
ls <- list()
sdrf <- list.files(i, full.names = T)[grep(".sdrf.txt",list.files(i))]
idf <- list.files(i, full.names = T)[grep(".idf.txt",list.files(i))]
ls[["sdrf"]] <- read.delim(file = sdrf, row.names = 1)
ls[["idf"]] <- read.delim(file = idf)
for(k in AE$processedFiles) {
ls[[k]] <- read.table(file = paste(i,k,sep = "/"),row.names = 1)
}
ls[["info"]] <- AE
}
setwd("../")
if(import == T) return(ls)
}
# This function is ment for a bulk download without import into R. You can provide
# multiple accession IDs at once.
AEDownloadBulk <- function(accession, type = "processed", out = getwd()) {
# create output dir
dir.create(paste0(out,"/ArrayExpress"), showWarnings = F)
setwd(paste0(out,"/ArrayExpress"))
# run getAE() in loop for accession numbers
for(i in accession) {
dir.create(i, showWarnings = F)
getAE(i, type = type, path = i, extract = T)
zip <- list.files(i, full.names = T)[grep(".zip",list.files(i))]
if(file.exists(zip[1])) file.remove(zip)
}
setwd("../")
}
## Now we use the function defined above to download & import the following accessions into R
accession = c("E-MTAB-9056","E-MTAB-9054")
for(i in accession) {
x <- AEDownload(i)
assign(i,x)
}
rm(x,i)
# If you only wish to download the files without loading them into R simply run:
AEDownloadBulk(accession)
###################################
# Congrats! :)
# Now you have successfully downloaded the processed data files for "E-MTAB-9056" and "E-MTAB-9054".
# You should have two R objects with those names in you R environment now.
# Search your environment for E-MTAB-9056" and "E-MTAB-9054" and look at their structures
# (hint: str() and View())
# Q.4) What kind of objects are "E-MTAB-9056" and "E-MTAB-9054"?
# How many objects are stored in them? (hint: str() and View())
# Q.5) Both E-MTAB objects should contain an IDF & SDRF file. What does IDF and SDRF stand for? What kind of information is stored in them?
# ( Hint: Check the vignette: browseVignettes("ArrayExpress") )
# Let's extract the count matrices from the E-MTAB objects
mtx.6PTU = `E-MTAB-9054`[["6PTU_CountMatrix.txt"]]
mtx.T3 = `E-MTAB-9056`[["T3_CountMatrix.txt"]]
# Q.6) Look at the CountMatrices. How many ENSEMBL Gene IDs and how many Samples (Sample IDs) are in there?
# Do both experiments have the same number of samples?
# Now let us have a look at the sample annotation files.
# Therefore we are extracting the sdrf file with the annotation info
anno.6PTU = `E-MTAB-9054`[["sdrf"]]
anno.T3 = `E-MTAB-9056`[["sdrf"]]
colnames(anno.6PTU)
# As you might see when you are looking at the, they contain a lot of columns.
# Many of those columns we don't need for downstream analysis. Hence only select the following:
coln <- c("Characteristics.growth.condition.","Comment.replicate.",
"Comment.ENA_SAMPLE.","Characteristics.organism.","Assay.Name")
x = anno.6PTU[,coln]
x$Substance <- "6PTU"
coln2 <- c("Characteristics.growth.condition.","Comment.tank.spawning.group.",
"Comment.ENA_SAMPLE.","Characteristics.organism.","Assay.Name")
y = anno.T3[,coln2]
y$Substance <- "T3"
# Replace column names
colnames(x)[c(1:2,5)] <- colnames(y)[c(1:2,5)] <- c("Condition","Tank","ID")
# Rename "normal" with "control" in the x$Condition column
x[x$Condition %in% "normal","Condition"] <- "control"
# Finally! Let us export our filtered annotation file to a csv
write.csv2(x, file = "ArrayExpress/E-MTAB-9054/6PTU_coldata.csv")
write.csv2(y, file = "ArrayExpress/E-MTAB-9056/T3_coldata.csv")
# Well done! :) Now you have everything you need for a differential gene expression analysis.
# The required input is a count matrix of genes and a sample annotation file (coldata file).
# From here on we can continue with EdgeR or DESeq2
## Session Information & save Rdata ## ------------
sink(paste0(home,"/ArrayExpress/AE_SessionInfo.txt"))
print(date())
print(devtools::session_info())
sink()
save.image(paste0(home,"/ArrayExpress/AE.RData"))
# you can load this object via:
# load("path/to/file/biomaRt.RData"))
#######################################
### END OF SCRIPT ###
#######################################