-
Notifications
You must be signed in to change notification settings - Fork 1
/
Summary statistics of annotations.Rmd
136 lines (127 loc) · 3.2 KB
/
Summary statistics of annotations.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "Summary statistics of annotations For Fig. 7"
author: "xyz"
date: "2020/11/14"
output: html_document
---
### Public DB
```{r}
df <- readRDS("../temp/ncbiMasterProtein.rds")
df <-
df[df$`Found.in.Sample.Group:.high` != "Not Found" |
df$`Found.in.Sample.Group:.low` != "Not Found" , ]
df2 <- readRDS("../temp/metaMasterProtein.rds")
df2 <-
df2[df2$`Found.in.Sample.Group:.high` != "Not Found" |
df2$`Found.in.Sample.Group:.low` != "Not Found" , ]
ncbi <- read.table(
"../data/panamaNCBIblast2gotable.txt",
sep = "\t",
header = F,
stringsAsFactors = F,
skip = 1,
quote = "\""
)
ncbi <- ncbi[ncbi$V3 %in% df$Accession, ]
# EWC45899.1 can be found in ncbi nr now
setdiff(df$Accession,ncbi$V3)
df[df$Accession=="EWC45899.1","Sequence"]
# 4,319 proteins were indentified by Public DB, 3,184 got GO annotations, or[BLASTED, MAPPED, ANNOTATED] and [BLASTED, MAPPED]
# [BLASTED, MAPPED, ANNOTATED] [BLASTED, MAPPED] [BLASTED]
# 3029 155 1135
table(ncbi$V2)
ncbi <-
read.table(
"../data/ncbi.user.out.top",
sep = "\t",
header = F,
stringsAsFactors = F
)
ncbi <- ncbi[stringr::str_sub(ncbi$V1, 6) %in% df$Accession, ]
# 2438 proteins got KEGG annotations
sum(ncbi$V2 != "")
```
### meta
```{r}
meta <- read.table(
"../data/panamaMetablast2gotable.txt",
sep = "\t",
header = F,
stringsAsFactors = F,
skip = 1,
quote = "\""
)
meta <- meta[meta$V3 %in% df2$Accession, ]
# 18706 proteins were indentified by Meta DB,13023 got GO annotations
# [BLASTED, MAPPED, ANNOTATED] [BLASTED, MAPPED] [BLASTED] [NO-BLAST]
# 9527 3496 5683 241
table(meta$V2)
meta <-
read.table(
"../data/meta.user.out.top",
sep = "\t",
header = F,
stringsAsFactors = F
)
meta <- meta[stringr::str_sub(meta$V1, 6) %in% df2$Accession, ]
# 7813 proteins got KEGG annotations
sum(meta$V2 != "")
```
### plot
```{r}
ncbiB2gTag <- read.table(
text = "
Tag,Count
Blasted,4320
Go Annotated,3184
KEGG Annotated,2438",
header = T,
sep = ",",
stringsAsFactors = F
)
metaB2gTag <- read.table(
text = "
Tag,Count
Blasted,18706
Go Annotated,13023
KEGG Annotated,7813",
header = T,
sep = ",",
stringsAsFactors = F
)
drawDf <-
data.frame(
Tag = c(ncbiB2gTag$Tag, metaB2gTag$Tag),
Count = c(ncbiB2gTag$Count, metaB2gTag$Count),
Database = rep(c("Public", "Meta"), each = 3)
)
drawDf$Database <- factor(drawDf$Database, levels = c("Meta", "Public"))
library(ggplot2)
ggplot(drawDf, aes(x = Tag, y = Count, fill = Database)) +
ylim(0, 25000) +
geom_bar(
stat = "identity",
position = "dodge",
width = 0.8,
col = 'black'
) +
# add numbers
geom_text(
aes(label = Count),
position = position_dodge(width = 0.8),
hjust = -0.25,
size = 6
) +
coord_flip() +
theme(
axis.title.y = element_blank(),
text = element_text(size = 30),
axis.text = element_text(colour = "black")
) +
ggsave(
"../figure/Summary statistics of protein annotations.png",
width = 12.80,
height = 7.2,
dpi = 100
)
```