-
Notifications
You must be signed in to change notification settings - Fork 1
/
polan.py
371 lines (323 loc) · 15.5 KB
/
polan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""polan provides functions for simulating polysome profiles from other types of data.
The main functions intended for users are:
fp2poly: models polysome peak information from a set of ribosome footrinting data
plot_poly: generates a plot using output from fp2poly
compare_profiles: compares two ribosome footprinting datasets based on analyses of mRNA shifts between peaks in a corresponding polysomes
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
RNA_ref = pd.read_csv('Data/RNA_reference.csv') # read only once.
#read in gene length information
genes = pd.read_csv('Data/sacCer3 genes.csv')
def find_poly_peaks(trace, run_length=4, exclude_beginning=0.1):
#read out some basic trace parameters
trace_x_max = max(trace['x'])
#ensure that x-values are ordered
#(this may not be the case with plots produced via webplotdigitizer)
trace = trace.sort_values('x')
points = trace['y']
points_prime = [points[y]-points[y-1] for y in range(1, len(points))]
#determine positive runs
pos_run_ends = []
ploc = 0
run_length = 4
while ploc < len(points_prime)-1:
if points_prime[ploc] > 0:
pos_start = ploc
ploc += 1
while points_prime[ploc] > 0 and ploc < len(points_prime)-1:
ploc += 1
pos_end = ploc
if pos_end-pos_start > run_length:
pos_run_ends.append(pos_end)
else:
ploc += 1
#determine negative runs
neg_run_starts = []
nloc = 0
while nloc < len(points_prime)-1:
if points_prime[nloc] < 0:
neg_start = nloc
nloc += 1
while points_prime[nloc] < 0 and nloc < len(points_prime) - 1:
nloc += 1
neg_end = nloc
if neg_end - neg_start > run_length:
neg_run_starts.append(neg_start)
else:
nloc += 1
#determine peak positions as the mean in x between a pos run end and the
# closest following neg run start
global peakxs
peakxs = []
neg_start_ys = trace.iloc[neg_run_starts]['x'].values
for entry in range(len(pos_run_ends)):
pos_end_x = trace.iloc[pos_run_ends[entry]]['x']
match_neg_start_x = min(neg_start_ys[neg_start_ys >= pos_end_x])
#ensure that the pos run end and neg run start are not further than
# a maximum distance apart:
if (match_neg_start_x - pos_end_x) < (trace_x_max / 30):
#exclude cases where there is a second pos run end prior to the
# following matching neg run start
#either include because this is the last value in pos_run_ends
if entry == len(pos_run_ends)-1:
print('ps_run_end length reached')
peakxs.append((pos_end_x + match_neg_start_x)/2)
#or include if there is no additional pos run between the current pos run and
# the next neg run
elif match_neg_start_x < trace.iloc[pos_run_ends[entry + 1]]['x']:
#exclude peaks near the beginning of the gradient, which are usually dirt
if pos_end_x > trace_x_max * exclude_beginning:
peakxs.append((pos_end_x + match_neg_start_x)/2)
return peakxs
def plot_trace(trace, peakxlocs=[], add_x=0, predxlocs=[]):
_, ax = plt.subplots()
ax.scatter(trace['x'], trace['y'], c='black', s=2)
if len(peakxlocs) > 0:
for loc in peakxlocs:
ax.plot([loc, loc], [min(trace['y']), max(trace['y'])], c='green')
if add_x > 0:
ax.plot([add_x, add_x], [min(trace['y']), max(trace['y'])], c='orange')
if len(predxlocs) > 0:
for loc in predxlocs:
ax.plot([loc, loc], [min(trace['y']), max(trace['y'])], c='red')
ax.set_ylabel(r'$oD_{254}$')
plt.show()
def fit_peaks(peakxs, grad_end, mode='mammal'):
#construct a dictionary of relative LSU/SSU MW data for different organisms
LSU_SSU_data = {'mammal':0.37, 'yeast':0.34}
if type(mode) == str:
LSU_SSU = LSU_SSU_data[mode]
else:
LSU_SSU = mode
#construct a list with the molecular weights of consecutive polysomes
# (starting with 40S, 60S, monosome,disome,...)
polyno = [LSU_SSU, 1 - LSU_SSU] + list(range(1, len(peakxs)-1))
#define a logarithmic function to fit the peak x values to
def logfunc(x, a, b, c):
return a*np.log(b * x + c)
#fit the parameters
params, params_covariance = curve_fit(logfunc, polyno, peakxs)
#calculate the fitted position of peaks following the observed peaks
xfit = np.arange(max(polyno)+1, 50)
peakfit = logfunc(xfit, params[0], params[1], params[2])
peakfit = peakfit[peakfit < grad_end]
xfun = np.linspace(0.33, len(peakxs)+len(peakfit), 150)
peakfun = logfunc(xfun, params[0], params[1], params[2])
return [peakfit, [xfun, peakfun]]
def fp2poly(df, df_columns=['ORF', 'Ribo_Prints', 'RNA_Prints'], RNA_content=60000,
ribo_content=200000, include_idle=True, frac_act=0.85, frac_split=0.30, poly_limit=30,
remove_spurious_RNAs=True,):
"""Calculates peak volumes of a polysome profile corresponding to
an input footprinting dataset."""
#####assemble a processible dataset
##if non-default columns are specified, check whether columns exist:
if df_columns != ['ORF', 'Ribo_Prints', 'RNA_Prints']:
if set(df_columns).issubset(df.columns):
pass
else:
print('Specified columns do not exist in data frame. Available columns include: \n' +
df.columns)
return
elif 'RNA_Prints' in df.columns:
dats = df[df_columns]
else:
dats = df[df_columns[:2]]
#check whether reference RNA-Seq data need to be used
if len(dats.columns) == 2:
dats.columns = ['ORF', 'Ribo_Prints']
dats = dats.merge(RNA_ref, how='inner', on='ORF')
print('No mRNA data specified, using reference RNA-Seq data.')
elif len(df_columns) == 3:
dats.columns = ['ORF', 'Ribo_Prints', 'RNA_Prints']
else:
print('Incorrect column number. \nfp2poly requires two or three columns in this order: \n' +
'1) gene names, 2) Ribo-Seq counts, 3) (optional) RNA-Seq counts')
return
#remove rows where either the RNA prints or Ribo prints are 0
# (these do not contribute information to the profile)
dats = dats.loc[(dats['RNA_Prints'] > 0) & (dats['Ribo_Prints'] > 0)]
######convert Ribo-Seq reads to RPKM
#combine input dataset with gene length information
dats = dats.merge(genes[['name', 'length']],
how='inner', left_on='ORF', right_on='name')[
['ORF', 'RNA_Prints', 'Ribo_Prints', 'length']]
#calculate RPKM
dats['RNA_RPKM'] = dats['RNA_Prints']/(dats['length']/1000)
#####sort genes into polysome peaks according to RPKM info
#determine conversion factor from Ribo_Prints to no of Ribosomes
RiboPrints2Ribos = (ribo_content * frac_act) / sum(dats['Ribo_Prints'])
dats['Ribos_bound'] = dats['Ribo_Prints'] * RiboPrints2Ribos
#determine conversion factor from RNA_RPKM to no of RNAs
RNARPKM2RNAs = RNA_content / sum(dats['RNA_RPKM'])
dats['RNAs_per_cell'] = dats['RNA_RPKM'] * RNARPKM2RNAs
#calculate the ribosome load per RNA (RperR)
dats['RperR'] = dats['Ribos_bound'] / dats['RNAs_per_cell']
dats = dats.dropna()
#remove rows where the number of ribosomes per RNA is > poly_limit
dats = dats.loc[dats['RperR'] <= poly_limit]
#remove spurious RNAs (< than 0.05 RNAs per cell)
if remove_spurious_RNAs:
dats = dats.loc[dats['RNAs_per_cell'] > 0.05]
######assign RNAs into polysome peaks
#make an array to hold the relative weights for each polysome class
poly_array = np.zeros(poly_limit+2)
#if indicated, assign idle ribosomes to the first three peaks,
# based on the fraction of active ribosomes and the fraction of split inactive ribosomes
if include_idle:
idle_ribos = (1 - frac_act) * 200000
poly_array[0] += idle_ribos * frac_split * 0.34
poly_array[1] += idle_ribos * frac_split * 0.66
poly_array[2] += idle_ribos * (1-frac_split)
#go through each row of dats and add ribosomes to the appropriate peak
for _, row in dats.iterrows():
this_RperR = row['RperR']
these_Ribos_bound = row['Ribos_bound']
#if the number of ribos per RNA is an exact number,
# assign the ribos to the corrresponding peak
floor_val = int(this_RperR)
if float(floor_val) == this_RperR:
poly_array[floor_val + 1] += these_Ribos_bound
#if the number of ribos is between two integers,
# split the ribos proportionally between the two adjacent peaks
#for example, for 5.6 Ribos per RNA 60% of ribosomes go to the 6-some peak,
#40% to the 5-some peak
else:
ceil_weight = (this_RperR-floor_val)
floor_weight = 1 - ceil_weight
if floor_val != 0:
poly_array[floor_val + 1] += these_Ribos_bound * floor_weight
poly_array[floor_val + 2] += these_Ribos_bound * ceil_weight
#normalise values to total signal
poly_array = poly_array/sum(poly_array)
return poly_array
def plot_poly(peak_vols):
"""
Returns x and y coordinates of a polysome profile from a list of peak volumes
computed by fp2poly
"""
#define the function for returning a normal distribution centred around mu with variance
# (=peak width) sigma
def normpdf(x, mu, sigma):
return 1/(np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-1 * ((x - mu) ** 2 / (2 * sigma ** 2)))
#define the relative molecular weights corresponding to the different
# peaks (40S, 60S, 1-,2-,3-,... some)
peak_ids = [0.34, 0.66] + list(range(1, len(peak_vols)-1))
#calculate a series of peak locations based on a typical polysome profile
peak_locs = [0.15*np.log(19 * x -0.6) for x in peak_ids]
#calculate a series of peak widths based on a typical polysome profile
peak_widths = [-0.0004 * x + 0.015 for x in peak_ids]
#definine the oD drift and the initial peak based on a typical polysome profile
x = np.linspace(0, 1, num=400)
drift = -0.07 + 0.19 * x + 0.05
debris = np.exp((-x + 0.085) * 20) * 1.2
#construct the plot of the polysome profile
sum_trace = drift + debris
for peak_no in range(len(peak_locs)):
this_peak = normpdf(x, peak_locs[peak_no], peak_widths[peak_no]) * peak_vols[peak_no]
sum_trace += this_peak
return x, sum_trace
def compare_profiles(dats1, dats2, dats1_columns=['ORF', 'RNA_Prints', 'Ribo_Prints'],
dats2_columns=['ORF', 'RNA_Prints', 'Ribo_Prints'],
colors=['royalblue', 'gold'], conditions=['Cond. 1', 'Cond. 2'],
return_df=False):
"""
Computes and displays the predominant movements of transcripts between polysome peaks
for two conditions.
"""
def counts_to_RPKM(dats):
#read in the gene length info dataset
genes = pd.read_csv('Data/sacCer3 genes.csv')
#combine input dataset with gene length information
dats = dats.merge(genes[['name', 'length']], how='inner', left_on='ORF', right_on='name')[
['ORF', 'RNA_Prints', 'Ribo_Prints', 'length']]
#calculate RPKM
dats['RNA_RPKM'] = dats['RNA_Prints']/(dats['length']/1000)
dats = dats.drop(['RNA_Prints', 'length'], axis=1)
return dats
def calc_RperR(Rdats, RNA_content=60000, ribo_content=200000, frac_act=0.85, poly_limit=30):
#determine conversion factor from Ribo_Prints to no of Ribosomes
RiboPrints2Ribos = (ribo_content * frac_act) / sum(Rdats['Ribo_Prints'])
Rdats['Ribos_bound'] = Rdats['Ribo_Prints']*RiboPrints2Ribos
#determine conversion factor from RNA_RPKM to no of RNAs
RNARPKM2RNAs = RNA_content / sum(Rdats['RNA_RPKM'])
Rdats['RNAs_per_cell'] = Rdats['RNA_RPKM']*RNARPKM2RNAs
#calculate the ribosome load per RNA (RperR)
Rdats['RperR'] = np.round(Rdats['Ribos_bound'] / Rdats['RNAs_per_cell'])
Rdats = Rdats.dropna()
#remove rows where the number of ribosomes per RNA is > poly_limit
Rdats = Rdats.loc[Rdats['RperR'] <= poly_limit]
return Rdats
#prepare datasets and compute RperR values via RNA RPKM values
dats1 = dats1[dats1_columns]
dats1.columns = ['ORF', 'RNA_Prints', 'Ribo_Prints']
dats1 = counts_to_RPKM(dats1)
dats1 = calc_RperR(dats1)
dats2 = dats2[dats2_columns]
dats2.columns = ['ORF', 'RNA_Prints', 'Ribo_Prints']
dats2 = counts_to_RPKM(dats2)
dats2 = calc_RperR(dats2)
#compute column movements for individual transcripts
fromto = dats1[['ORF', 'RperR']].merge(dats2[['ORF', 'RperR']], how='inner', on='ORF')
fromto.columns = ['ORF', 'from', 'to']
#calculate main destinations for each origin peak
from_unique = fromto['from'].unique()
from_unique.sort()
from_vec, to_vec, dir_vec, alpha_vec = [], [], [], []
for from_idx in range(len(from_unique)):
this_from = fromto.loc[fromto['from'] == from_unique[from_idx]]
this_to_unique = this_from['to'].unique()
for to_idx in range(len(this_to_unique)):
from_vec.append(from_unique[from_idx])
to_vec.append(this_to_unique[to_idx])
if from_unique[from_idx] < this_to_unique[to_idx]:
dir_vec.append('up')
elif from_unique[from_idx] > this_to_unique[to_idx]:
dir_vec.append('down')
else:
dir_vec.append('nc')
alpha_vec.append(np.log(
this_from.loc[this_from['to'] == this_to_unique[to_idx]].shape[0]))
alpha_vec_scaled = [(alpha)/(max(alpha_vec)*0.7) for alpha in alpha_vec]
df = pd.DataFrame({'From':from_vec, 'To':to_vec, 'Direction':dir_vec, 'Alpha':alpha_vec_scaled})
up_df = df.loc[df['Direction'] == 'up']
down_df = df.loc[df['Direction'] == 'down']
#prepare main figure
fig = plt.figure(constrained_layout=True, figsize=(5, 3))
gs = fig.add_gridspec(2, 6)
ax1 = fig.add_subplot(gs[0, :-2])
ax2 = fig.add_subplot(gs[1, :-2])
ax3 = fig.add_subplot(gs[0:, -2:])
ax3.axis('off')
for idx in range(up_df.shape[0]):
ax1.plot([up_df.iloc[idx]['From'], up_df.iloc[idx]['To']], [3, 1], linewidth=4,
color=colors[0], alpha=up_df.iloc[idx]['Alpha'])
ax1.set_ylim(0.9, 3.1)
ax1.set_yticks([3, 1])
ax1.set_yticklabels([conditions[0], conditions[1]])
ax1.set_xlim((0, 30))
for idx in range(down_df.shape[0]):
ax2.plot([down_df.iloc[idx]['From'], down_df.iloc[idx]['To']], [3, 1],
linewidth=4, color=colors[1], alpha=down_df.iloc[idx]['Alpha'])
ax2.set_ylim(0.9, 3.1)
ax2.set_yticks([3, 1])
ax2.set_yticklabels([conditions[0], conditions[1]])
ax2.set_xlabel('Polysome number')
ax2.set_xlim((0, 30))
#prepare legend for line width
ax3.set_ylim((0, 10))
ax3.set_xlim((0, 10))
ax3.text(8.5, 5, 'Number of mRNAs', verticalalignment='center', rotation=90)
ref_alphas = [0.1, 0.4, 0.7]
ref_values = [np.exp(max(alpha_vec)/10), np.exp(max(alpha_vec)/2), np.exp(max(alpha_vec))]
y_pos = [3.5, 5, 6.5]
for idx in [0, 1, 2]:
ax3.plot([0.5, 3.5], [y_pos[idx], y_pos[idx]],
linewidth=4, c='black', alpha=ref_alphas[idx])
ax3.text(4.5, y_pos[idx], str(int(ref_values[idx])), verticalalignment='center')
if return_df:
return fig, fig.axes, df
else:
return fig, fig.axes