# auto_finder_PAW.r
# library imports
library(tidyverse)
library(stringr)
library(limma)
library(psych)
library(Matrix)
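# tidyverse/stringr handle the data wrangling, limma provides plotMDS(),
# psych provides pairs.panels(), and Matrix supplies the sparse-matrix trick
# used below to rank the IQR grid entries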
# load the raw data: the table starts in Row 5
# there are extra lines at the bottom - need to stop before the end
data_import <- read_tsv("PAW_labeled_grouped_protein_summary_TMT_8.txt",
                        skip = 4, n_max = 4437, guess_max = 4437)
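# note: guess_max matches n_max so readr scans every imported row before guessing column types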
# the "Filter" column flags contams and decoys
data_all <- filter(data_import, is.na(Filter))
# check number of rows before and after filtering
print("Table lengths before and after filtering:")
cat(nrow(data_import), nrow(data_all))
# make a data frame for each TMT-plex (and drop zeros)
get_data <- function(df, substring) {
    # extracts PAW TMT columns containing a substring
    # df: data frame of PAW results
    # substring: a text substring that defines one TMT-plex

    # get the columns
    df <- df %>%
        select(starts_with("TotInt")) %>%
        select(contains(substring))

    # drop rows with any zero intensities (zeros break the downstream log transforms)
    df[apply(df, 1, function(row) all(row != 0)), ] # return filtered frame
}
# get each experiment and set column names to something easy to parse
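# the "_A_", "_B_", etc. substrings match the plex tags embedded in the TotInt column names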
A_raw <- get_data(data_all, "_A_")
B_raw <- get_data(data_all, "_B_")
C_raw <- get_data(data_all, "_C_")
D_raw <- get_data(data_all, "_D_")
# helper functions to automatically find the closest channel pairs
SL_norm <- function(df, suffix, print_factors = TRUE) {
    # Normalizes each channel's sum to the average grand total
    # df: data frame of TMT data (one column for each channel)
    # suffix: a text string to append to 1-to-n numbers for column names
    # print_factors: logical to control printing

    # compute norm factors and scale columns
    norm_facs <- mean(c(colSums(df))) / colSums(df)
    df_sl <- sweep(df, 2, norm_facs, FUN = "*")

    # simplify column names
    colnames(df_sl) <- str_c(as.character(1:ncol(df)), suffix)

    # print the normalization factors for QC check
    if (print_factors == TRUE) {
        cat("\nNormalization Factors: ", suffix, "\n ")
        cat(sprintf("%s - %0.3f\n", colnames(df), norm_facs))
    }
    df_sl # return normalized data
}
diff <- function(df, x, y) {
    # computes a standardized (percent) difference between two channels
    # note: this masks base::diff() for the rest of the session
    # df: data frame that contains x and y
    # x: index of first column
    # y: index of second column

    # compute the difference and set the column name
    diff <- 100 * abs(df[x] - df[y]) / rowMeans(df[c(x, y)])
    colnames(diff) <- sprintf("%d_%d", x, y)
    diff # return a one-column data frame of differences
}
iqr_grid <- function(df, print_grid = TRUE) {
    # computes an NxN grid of interquartile ranges of diffs
    # df: a data frame of TMT data
    # print_grid: logical to control printing

    # make an empty matrix ncol-by-ncol
    iqrs <- matrix(0.0, ncol(df), ncol(df))

    # populate the upper triangle with "diff" IQRs
    for (i in 1:(ncol(df) - 1)) {
        for (j in (i + 1):ncol(df)) {
            iqrs[i, j] <- IQR(diff(df, i, j)[[1]])
        }
    }

    # print the grid
    if (print_grid == TRUE) {
        cat("\nInterquartile range grid:\n")
        print(round(iqrs, 2))
    }
    iqrs # return the IQR grid
}
find_best_pair <- function(df, nmax, suffix, title) {
    # finds channel pairs having the smallest standardized differences
    # df: data frame with TMT channels
    # nmax: how many channels/pairs to check
    # suffix: suffix to add to channel labels
    # title: title string to use in plots

    # first do SL normalization
    df_sl <- SL_norm(df, suffix)

    # compute the IQR grid
    iqrs <- iqr_grid(df_sl)

    # find the nmax number of smallest IQR values
    # from https://stackoverflow.com/questions/38664241/ranking-and-counting-matrix-elements-in-r
    # summary() of a sparse Matrix returns a triplet frame (i, j, x) of the non-zero entries
    iqrs_long <- summary(Matrix(iqrs, sparse = TRUE))
    iqrs_ordered <- iqrs_long[order(iqrs_long[, 3]), ] # ascending IQR, smallest first

    # collect channel indexes in order of increasing IQR
    candidates <- c()
    for (row in 1:nrow(iqrs_ordered)) {
        candidates <- c(candidates, iqrs_ordered[row, 1], iqrs_ordered[row, 2])
    }

    # the top-n channels are the first nmax unique indexes
    top_channels <- sort(unique(candidates)[1:nmax])

    # print the best pairs
    for (pair in 1:nmax) {
        i <- iqrs_ordered[pair, 1]
        j <- iqrs_ordered[pair, 2]
        cat(sprintf("\nPair #%d: (%d, %d) => %0.2f", pair, i, j, iqrs[i, j]))
    }

    # make the box plots for the top-n pairs
    distributions <- list()
    for (row in 1:nmax) {
        distributions[[row]] <- diff(df_sl, iqrs_ordered[row, 1], iqrs_ordered[row, 2])
    }
    boxplot(do.call(cbind, distributions), main = title)

    # multi-panel scatter plots for the top-n channels
    pairs.panels(log10(df_sl[top_channels]), lm = TRUE, main = title)
}
# top_n is passed as nmax: how many top pairs/channels to consider (3 or 4 are recommended values)
top_n <- 4
# find the standards in experiment A
# this is now a one-line function call
find_best_pair(A_raw, top_n, "_A", "Exp A")
# experiment B
find_best_pair(B_raw, top_n, "_B", "Exp B")
# experiment C
find_best_pair(C_raw, top_n, "_C", "Exp C")
# and experiment D
find_best_pair(D_raw, top_n, "_D", "Exp D")
# check the clustering
A_sl <- SL_norm(A_raw, "_A", print_factors = FALSE)
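# plotMDS() is from limma; channels that behave similarly should cluster together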
plotMDS(log10(A_sl), main = "Experiment A")
pairs.panels(log10(A_sl), main = "Exp. A - all 10")
# log the session
sessionInfo()