-
Notifications
You must be signed in to change notification settings - Fork 5
/
getTCGAData.R
127 lines (91 loc) · 3.3 KB
/
getTCGAData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
################################################################
# title: "Script Used for Fetching and Processing TCGA-BRCA mRNA
# and miRNA Datasets for the MLcps Manuscript."
# author: "Akshay"
################################################################
library(SummarizedExperiment)
library(TCGAbiolinks)
# NOTE(review): this switches the session's working directory. The relative
# file names passed to write.csv() further down resolve against it, and the
# GDCdownload()/GDCprepare() calls are made without an explicit directory, so
# presumably their files land here as well -- confirm.
# setwd() raises an error if ~/Desktop/ does not exist on the machine running
# the script.
setwd("~/Desktop/")
################################################################
# mRNA Dataset
################################################################
# mRNA pipeline: query and download TCGA-BRCA STAR counts, preprocess,
# normalise and filter them, run a Normal-vs-Tumor differential expression
# analysis, and export the DEG count matrix (samples x genes) with a
# "status" label column to TCGA-BRCA_new.csv.

# Query only the two sample types that are compared below.
query.exp <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts",
  sample.type = c("Primary Tumor", "Solid Tissue Normal")
)
GDCdownload(
  query = query.exp,
  files.per.chunk = 100
)
brca.exp <- GDCprepare(query = query.exp)

# Subtype and clinical annotations are fetched here but are not used by the
# remaining steps of this script.
infomation.subtype <- TCGAquery_subtype(tumor = "BRCA")
information.clinical <- GDCquery_clinic(
  project = "TCGA-BRCA",
  type = "clinical"
)

# Barcodes of the primary tumour ("TP") and solid tissue normal ("NT") samples.
samples.primary.tumour <-
  brca.exp$barcode[brca.exp$shortLetterCode == "TP"]
samples.solid.tissue.normal <-
  brca.exp$barcode[brca.exp$shortLetterCode == "NT"]

# Preprocess -> normalise -> filter.
dataPrep <- TCGAanalyze_Preprocessing(
  object = brca.exp,
  cor.cut = 0.6
)
dataNorm <- TCGAanalyze_Normalization(
  tabDF = dataPrep,
  geneInfo = geneInfoHT,
  method = "gcContent"
)
dataFilt <- TCGAanalyze_Filtering(
  tabDF = dataNorm,
  method = "quantile",
  qnt.cut = 0.25
)

# Preprocessing with a non-zero cor.cut may discard low-correlation samples,
# so the barcode vectors collected above can contain samples that are no
# longer columns of dataFilt. Indexing with those would fail with
# "subscript out of bounds"; keep only the barcodes that survived.
normal.kept <- intersect(samples.solid.tissue.normal, colnames(dataFilt))
tumour.kept <- intersect(samples.primary.tumour, colnames(dataFilt))
dropped <- setdiff(
  c(samples.solid.tissue.normal, samples.primary.tumour),
  colnames(dataFilt)
)
if (length(dropped) > 0) {
  warning(
    length(dropped),
    " sample(s) were removed during preprocessing and are excluded from the DEA.",
    call. = FALSE
  )
}

# Differential expression: Normal (mat1) versus Tumor (mat2).
dataDEGs <- TCGAanalyze_DEA(
  mat1 = dataFilt[, normal.kept, drop = FALSE],
  mat2 = dataFilt[, tumour.kept, drop = FALSE],
  Cond1type = "Normal",
  Cond2type = "Tumor",
  fdr.cut = 0.001,
  logFC.cut = 2,
  method = "glmLRT",
  pipeline = "edgeR"
)

# Keep only the DEG rows; drop = FALSE keeps the matrix shape even when a
# single gene passes the cut-offs, so the transpose below stays samples x genes.
degCM <- dataFilt[rownames(dataDEGs), , drop = FALSE]
degCM <- as.data.frame(t(degCM))

# Label each sample (row); anything that is not a normal barcode is "Tumor".
degCM$status <- ifelse(
  rownames(degCM) %in% samples.solid.tissue.normal,
  "Normal",
  "Tumor"
)
write.csv(degCM, "TCGA-BRCA_new.csv", row.names = TRUE)
################################################################
# miRNA Dataset
################################################################
# miRNA pipeline: query and download TCGA-BRCA miRNA-Seq quantifications,
# keep the raw read counts, filter them, and export the matrix
# (samples x miRNAs) with a "status" label column to TCGA-BRCA-miRNA.csv.
query.miRNA <- GDCquery(
  project = "TCGA-BRCA",
  experimental.strategy = "miRNA-Seq",
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification"
)
GDCdownload(query = query.miRNA)
dataAssy.miR <- GDCprepare(query = query.miRNA)
rownames(dataAssy.miR) <- dataAssy.miR$miRNA_ID

# Keep only the per-sample read-count columns. The pattern is anchored to the
# "read_count_" prefix so that no other column containing "count" can slip in,
# and drop = FALSE keeps a data frame (with its row names) even when only one
# sample column is selected.
read_countData <- grep("^read_count_", colnames(dataAssy.miR), value = TRUE)
dataAssy.miR <- dataAssy.miR[, read_countData, drop = FALSE]

# Strip the prefix so that the column names are the bare sample barcodes.
colnames(dataAssy.miR) <- sub("^read_count_", "", colnames(dataAssy.miR))

# Note: this overwrites the dataFilt object created for the mRNA dataset.
dataFilt <- TCGAanalyze_Filtering(
  tabDF = dataAssy.miR,
  method = "quantile",
  qnt.cut = 0.25
)

### metadata
# Barcodes of all queried samples, split into primary tumour ("TP") and
# solid tissue normal ("NT"). dataSmTP.miR is not used further below.
samplesDown.miR <- getResults(query.miRNA, cols = c("cases"))
dataSmTP.miR <- TCGAquery_SampleTypes(
  barcode = samplesDown.miR,
  typesample = "TP"
)
dataSmNT.miR <- TCGAquery_SampleTypes(
  barcode = samplesDown.miR,
  typesample = "NT"
)

# Transpose to samples x miRNAs and label each sample.
# NOTE(review): the query above is not restricted by sample type, so every
# sample that is not in the NT set -- not only the TP ones -- is labelled
# "Tumor" here. Confirm that this is intended.
brcaMIR <- as.data.frame(t(dataFilt))
brcaMIR$status <- ifelse(
  rownames(brcaMIR) %in% dataSmNT.miR,
  "Normal",
  "Tumor"
)
write.csv(brcaMIR, "TCGA-BRCA-miRNA.csv", row.names = TRUE)