-
Notifications
You must be signed in to change notification settings - Fork 0
/
stopcodon.R
364 lines (325 loc) · 15.3 KB
/
stopcodon.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/usr/bin/env Rscript
## Usage: Rscript stopcodon.rscript <treefile.ph> <seqfile.fasta>
## The tree file should include branch lengths. Note that relative branch lengths are treated as fixed.
## The sequence file should contain a codon-aware alignment in fasta format and must include the
## stop codon as the last position of the alignment. Sequences that include a gap or a codon other than
## a stop codon (i.e. sequences for which the stop codon is not positionally homologous with the last
## position in the alignment) are excluded.
## Output: a file called stopcodon.rscript.out, which contains the following: The maximum log likelihood;
## ML estimate of kappa; ML estimate of omega; ML estimate of the treescaling parameter; ML estimate of phi
## delta_lnL (difference in log likelihood compared to a model with phi=1; convergence of optimizer (0 = success,
## 1 = failure)
##Kappa is transition/transversion rate
args = commandArgs(trailingOnly=TRUE)
model = 'MG'
codon_frequencies_model = 'f1x4'
header_trim_len<- 50
#####Requires ape and expm
suppressMessages(require(ape))
suppressMessages(require(expm))
suppressMessages(library(DECIPHER))
suppressMessages(library(Biostrings))
suppressMessages(library(phylotools))
#suppressMessages(library(lmtest))
######Setup########
setup = function(omega,phi,kappa,treefile, seqfile) {
amino_acids = c("F", "L", "S", "Y", "C", "W", "P", "H", "Q", "R", "I", "M", "T", "K", "N", "V", "A", "D", "E", "G","X")
codons = c("ttt", "ttc", "tta", "ttg", "tct", "tcc", "tca", "tcg", "tat", "tac", "tgt", "tgc", "tgg", "ctt", "ctc", "cta", "ctg", "cct", "ccc", "cca", "ccg", "cat", "cac","caa", "cag", "cgt", "cgc", "cga", "cgg", "att", "atc", "ata", "atg", "act", "acc", "aca", "acg", "aat", "aac","aaa", "aag", "agt", "agc", "aga", "agg", "gtt", "gtc","gta", "gtg", "gct", "gcc", "gca", "gcg", "gat", "gac","gaa", "gag", "ggt", "ggc", "gga", "ggg", "tag", "tga","taa")
aa = c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y","C", "C", "W", "L", "L", "L", "L", "P", "P", "P", "P","H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I","M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S","R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D","D", "E", "E", "G", "G", "G", "G", "X", "X", "X")
codon_numbers = list(ttt = 1, ttc = 2, tta = 3, ttg = 4,tct = 5, tcc = 6, tca = 7, tcg = 8, tat = 9, tac = 10,tgt = 11, tgc = 12, tgg = 13, ctt = 14, ctc = 15, cta = 16,ctg = 17, cct = 18, ccc = 19, cca = 20, ccg = 21, cat = 22,cac = 23, caa = 24, cag = 25, cgt = 26, cgc = 27, cga = 28,cgg = 29, att = 30, atc = 31, ata = 32, atg = 33, act = 34,acc = 35, aca = 36, acg = 37, aat = 38, aac = 39, aaa = 40,aag = 41, agt = 42, agc = 43, aga = 44, agg = 45, gtt = 46,gtc = 47, gta = 48, gtg = 49, gct = 50, gcc = 51, gca = 52,gcg = 53, gat = 54, gac = 55, gaa = 56, gag = 57, ggt = 58,ggc = 59, gga = 60, ggg = 61, tag = 62, tga = 63, taa = 64)
nucleotides = list(a = 1, c = 2, g = 3, t = 4)
purine = c(1, 0, 1, 0)
tree = read.tree(tree_file)
tree=reorder(tree,"postorder")
#tree=reorder(tree_file,"postorder")
edges = tree$edge
#seqs = read.dna(seq_file,fo="fasta")
chars = seqfile #as.character(seqs)
if(class(chars)=="list"){
alignment_len<-length(chars[[1]])
data.clean<-matrix(,nrow=length(chars),ncol = alignment_len)
for (i in 1:length(chars)) {
if(length(chars[[i]])==alignment_len){
temp<-t(matrix(unlist(chars[[i]])))
#rownames(temp)<-names(chars[i])
data.clean[i,]<-temp[1,]
}
}
rownames(data.clean)<-names(chars)
chars<-data.clean
}
if(!all(sort(tree$tip.label)==sort(rownames(chars)))){
##Trimming the headers
tree$tip.label=substr(toupper(tree$tip.label),1,header_trim_len)
rownames(chars)=substr(toupper(rownames(chars)),1,header_trim_len)
}
chars = chars[tree$tip.label,]
# if(ncol(chars)%%3==1)
# {
##Shift frames to left/right because they are not in frame (adding a column of gaps at 1st column)
##CHEATING
# chars<-cbind(as.matrix(rep("-",nrow(chars)),nrow=nrow(chars)),as.matrix(rep("-",nrow(chars)),nrow=nrow(chars)),chars)
# }
#set up PI
f1 = c(0, 0, 0, 0)
f2 = c(0, 0, 0, 0)
f3 = c(0, 0, 0, 0)
ff = c(0,0,0,0)
PI = array(rep(0, 64*64), dim = c(64, 64))
PI3x4 = array(rep(0, 64*64), dim = c(64, 64))
PI1x4 = array(rep(0, 64*64), dim = c(64, 64))
s_c=0
#convert to codons - produces cdAlign which is a matrix representing the coding sequence alignment. Each
#row is a sequence, each column is a numeric representation (1-64) of the codon
cdAlign = array(rep(0,nrow(chars)*ncol(chars)/3),c(nrow(chars),ncol(chars)/3))
for(i in 1:nrow(chars)) {
s = paste(chars[i,],collapse="")
for(j in 1:(ncol(chars)/3)) {
ss = substr(s,3*(j-1)+1,3*j)
ss=tolower(ss)
#if(ss!="+++" && ss!="---")
#{print(ss)}
if(!is.null(codon_numbers[[ss]]) && ss!="+++" && ss!="---") {
if(ss=="taa"||ss=="tag"||ss=="tga"){
##counting stop codons ##print(ss)
s_c=s_c+1
}
cdAlign[i,j] = codon_numbers[[ss]]
f1[nucleotides[[substr(ss,1,1)]]] = f1[nucleotides[[substr(ss,1,1)]]] + 1
f2[nucleotides[[substr(ss,2,2)]]] = f2[nucleotides[[substr(ss,2,2)]]] + 1
f3[nucleotides[[substr(ss,3,3)]]] = f3[nucleotides[[substr(ss,3,3)]]] + 1
ff[nucleotides[[substr(ss,1,1)]]] = ff[nucleotides[[substr(ss,1,1)]]] + 1
ff[nucleotides[[substr(ss,2,2)]]] = ff[nucleotides[[substr(ss,2,2)]]] + 1
ff[nucleotides[[substr(ss,3,3)]]] = ff[nucleotides[[substr(ss,3,3)]]] + 1
}
else {
cdAlign[i,j] = NA
}
}
}
#print(c("SC:",s_c,"nSeqs",nrow(chars)))
dimnames(cdAlign)[[1]]=tree$tip.label
##cdAlign cleanup empty/masked rows
mskd_cols<-c()
for(i in 1:ncol(cdAlign)){
if(all(is.na(cdAlign[,i]))){
mskd_cols<-c(mskd_cols,i)
}
}
#mskd_cols=unique(mskd_cols)
##IF mskd_cols is not empty
if(!is.null(mskd_cols)){
cdAlign<-cdAlign[,-mskd_cols]
}
#Frame based proportions of nucleotides(on each position of the frame)?
f1 = f1/sum(f1)
f2 = f2/sum(f2)
f3 = f3/sum(f3)
ff = ff/sum(ff)
##Construct PI based on model (1x4,3x4) (1x4=same rates based on global proption, 3x4=induvidual rates)
for (i in 1:64) {
PI3x4[i, i] = f1[nucleotides[[substr(codons[i], 1, 1)]]] * f2[nucleotides[[substr(codons[i],2, 2)]]]* f3[nucleotides[[substr(codons[i],3, 3)]]]
PI1x4[i, i] = ff[nucleotides[[substr(codons[i], 1, 1)]]] * ff[nucleotides[[substr(codons[i],2, 2)]]]* ff[nucleotides[[substr(codons[i],3, 3)]]]
}
# Toggle f3x4 f1x4
#PI = PI3x4/sum(PI3x4)
PI = PI1x4/sum(PI1x4)
Mg_mult = array(c(rep(0, 4096)), dim = c(64, 64))
###############Set up the generator matrix############
R = array(c(rep(0, 4096)), dim = c(64, 64))
for (i in 1:64) {
for (j in 1:64) {
diffs = 0
for (k in 1:3) {
nuc1 = nucleotides[[substr(codons[i], k, k)]]
nuc2 = nucleotides[[substr(codons[j], k, k)]]
if (nuc1 != nuc2) {
diffs = diffs + 1
if(codon_frequencies_model == 'f1x4') {
Mg_mult[i,j] = ff[nuc2]
} else if(codon_frequencies_model == 'f3x4') {
if(k==1) {Mg_mult[i,j] = f1[nuc2]}
if(k==2) {Mg_mult[i,j] = f2[nuc2]}
if(k==3) {Mg_mult[i,j] = f3[nuc2]}
} else {print("Codon frequencies model undefined")
return(NA)
}
if (purine[nuc1] == purine[nuc2]) {
R[i, j] = kappa
}
else {
R[i,j] = 1
}
if (aa[i] != aa[j]) {
R[i,j] = R[i,j] * omega
}
if(i > 61 & j <= 61) { # stop to non stop
R[i,j] = 0
}
if(i <= 61 & j > 61) { # non stop to stop
R[i,j] = 0
}
if(i > 61 & j > 61 & i != j) {
R[i,j] = phi * R[i,j]
}
}
}
if (diffs > 1) {
R[i, j] = 0
}
}
}
#model defines instantaneous rate matrix (or TPMs?) [more info: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4379412/]
##GY- target frequencies in TPM, MG- target frequencies in PI
##GY- target frequencies are suseptible to change, MG- frequencies are constant
##MG-style models; the first, MG1, employs four parameters for nucleotide frequencies (one per nucleotide; [Muse and Gaut 1994](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4379412/#msv003-B43))
##
##dN/dS-based model matrices account for nucleotide mutational bias by incorporating either target codon (Goldman and Yang 1994) or target nucleotide (Muse and Gaut 1994) frequencies; these frameworks are known, respectively, as Goldman–Yang (GY)-style and Muse–Gaut (MG)-style models (Kosakovsky Pond et al. 2010). For example, the instantaneous rate matrix element giving the substitution probability from codon AAA to AAG would contain the target codon frequency PAAG in GY-style models but the target nucleotide frequency πG in MG-style models
##GY-style matrices may be expressed in the framework of the GTR model, in which the instantaneous matrix Q can be decomposed into a 61 × 61 symmetric substitution rate matrix and a 61-dimensional vector containing the equilibrium codon frequencies. The latter corresponds to the stationary distribution of the Markov chain. In contrast, MG-style rate matrices are written in terms of nucleotide frequencies rather than codon frequencies. Therefore, whether these models fit into the GTR framework is unclear a priori. We now describe how the MG-style matrix can be rewritten in terms of a symmetric matrix and a vector of equilibrium codon frequencies, thus demonstrating that these matrices also fit into the GTR(General time reversible) framework.
##NOTE:ASK CATHAL ABOUT THIS PART OF CODE
if(model == 'MG') {
R = R * Mg_mult
} else if(model == 'GY') {
R = R %*% PI
} else {print("Model undefined")
return(NA)
}
#Scale the generator matrix
scale_fac = sum(PI%*%R)
##Normalize it so that it doesn't exceed 1
R = (1/scale_fac)*R
##Re-estimate generator matrix
##Sum each codon row without the column corresponding to the same codon an negate it
##Probability of not getting that codons is the reason for negating the sum? is it shorthand way of writing , p(codon) = -(codon occuring - codon not occuring)
for(i in 1:64) {
R[i,i] = -sum(R[i,-i])
}
return(list(edges=edges,tree=tree,R=R,cdAlign=cdAlign,PI=PI))
}
#####Felsenstein pruning algorithm
felsen=function(pos, tree, node, cdAlign, TPM) {
edges = tree$edge
prod = t(t(rep(1,64)))
##We see which edges are connected to the node
s = which(edges[,1] == node)
for(k in s) {
if(edges[k,2] <= length(tree$tip.label)){
if(!is.null(dim(cdAlign))) {
if(!is.na(cdAlign[tree$tip.label[edges[k,2]],pos])) {
prod = prod * t(t(TPM[k,,cdAlign[tree$tip.label[edges[k,2]],pos]]))
}
}
else {
if(!is.na(cdAlign[tree$tip.label[edges[k,2]]])) {
prod = prod * t(t(TPM[k,,cdAlign[tree$tip.label[edges[k,2]]]]))
}
}
}
else {
prod = prod * TPM[k,,]%*%felsen(pos,tree,edges[k,2],cdAlign, TPM)
}
}
return(prod)
}
########End of setup########
#Takes parameters: kappa, omega, treescale, phi
lik_fun = function(pars,treefile,seqfile,phifixed=0) {
kappa = pars[1]
omega = pars[2]
treescale = pars[3]
#phi = pars[4]
if(phifixed==1) {
phi = 1
} else {
phi = pars[4]
}
if(phi < 0 | omega < 0 | kappa < 0 | treescale < 0) {
return(NA)
}
plot_ll<<-c()
out = setup(omega,phi,kappa,treefile,seqfile)
edges = out$edges
tree = out$tree
##cdAlign returning all NAs
cdAlign = out$cdAlign
R = out$R
PI = out$PI
##N sequences have 2N edges. we get the probability of each edge as a TPM, ie, the probability of edge transitioning and transforming into another sequence.
TPM = array(rep(0,64*64*nrow(edges)),dim=c(nrow(edges),64,64))
##R is crashing here
for(i in 1:nrow(edges)){
##TPM is the probability of transition of each codon in the column of all sequences mutating to another codon. there are 64 slices, one slice for each codon. (why do we take edge though?)
TPM[i,,] = expm(treescale*tree$edge.length[i]*R)
}
likelihood = 0
for(i in 1:ncol(cdAlign)) {
##NAs produced qhen i=4 and trying to add cdAlign & TPM doesn't work(non conformable?), pruning works
##NODE=edges[nrow(edges),1],(why do we take only the last row of edges, which corresponds to the longest edge), guess we prune the longest branch
likelihood = likelihood + log(sum(diag(PI)*felsen(i, tree, edges[nrow(edges),1], cdAlign,TPM)))
#print(likelihood)
plot_ll<<-c(plot_ll,likelihood)
#if(is.infinite(likelihood)==TRUE){
# print(i)
# break ##Remove this
#}
}
#count=count+1
print(c(pars,likelihood))
return(likelihood)
}
##ENTRYPOINT
#MAKE SURE TO CHANGE '!'/'?' to N before proceeding
#tree_file = "cathal.ph" #args[1]
#seq_file = "cathal.fasta" #args[2] "../fasta/temp/3228.tree.newick" "../fasta/temp/3228.NT.fasta"
tree_file = args[2]
seq_file = args[1]
cluster_name=args[3]
#seq_file = "../fasta/PHG-01950.clean_NT.fasta" #args[2]
#aa_aln="../fasta/PHG-01950.clean_AA1.fasta"
#tree_file = "temp.newick" #args[1]
#seq_file = "temp.fasta" #args[2]
out_file = paste(seq_file,".stopout",sep="")
##continued from ALIGN_SEQS.R
##To reverse peptide alignment, use
##revtrans fasta/PHG-01950.clean.fasta fasta/PHG.AA.aln.fasta -match trans > fasta/PHG-01950.clean_NT.fasta
#python trim_fasta_header.py fasta/PHG-01950.clean_NT_NT.fasta 10 > fasta/PHG-01950.clean_NT_NT1.fasta
##Current ENTRYPOINT
plot_ll<<-c()
clean.fasta.name(infile = seq_file,outfile = seq_file)
data.aln<-readDNAMultipleAlignment(seq_file,format = "fasta")
d <- DistanceMatrix(myXStringSet = data.aln@unmasked,correction = "Jukes-Cantor", verbose = FALSE)
MODELS
##MAKE SURE to change '!'/'?' to Ns before proceeding
#ML approach only works for DNA seqs
#clu <- IdClusters(d, method="ML", showPlot = TRUE,model="TN93+G4", myXStringSet=data.aln@unmasked)
#print(clu)
dend<- IdClusters(d, method="NJ", type="dendrogram", showPlot = TRUE)
plot(dend)
str(dend)
#Dont mask because it masks stops
#dna.masked<-MaskAlignment(data.aln@unmasked)
#masks <- lapply(width(colmask(dna.masked)), rep, x="+")
#masks <- unlist(lapply(masks, paste, collapse=""))
#masked_dna <- replaceAt(dna.masked@unmasked, at=IRanges(colmask(dna.masked)), value=masks)
#BrowseSeqs(masked_dna)
#masked_dna<-as.matrix(masked_dna)
masked_dna<-as.matrix(as(data.aln,"DNAStringSet"))
WriteDendrogram(dend,file=tree_file,quoteLabels = FALSE)
##phi=1 is neutral selection
##phi<1 is purifying selection
m0.out = optim(c(2,0.2,1),lik_fun,treefile=tree_file,seqfile=masked_dna,phifixed=1,control=list(fnscale=-1))
print(m0.out)
plot(plot_ll)
#m1.out = optim(c(2,0.2,1,0),lik_fun,treefile=tree_file,seqfile=seq_file,control=list(fnscale=-1))
m1.out = optim(c(m0.out$par,0.5),lik_fun,treefile=tree_file,seqfile=masked_dna,phifixed=0,control=list(fnscale=-1))
print(m1.out)
plot(plot_ll)
m2.out = optim(c(m1.out$par),lik_fun,treefile=tree_file,seqfile=masked_dna,control=list(fnscale=-1))
print(m2.out)
#m3.out = optim(c(m0.out$par,1.5),lik_fun,treefile=tree_file,seqfile=masked_dna,control=list(fnscale=-1))
#print(m3.out)
#lrtest(m2,m1)
diff = m2.out$value - m0.out$value
write(c(m2.out$value,m2.out$par,diff,m2.out$convergence,!(pchisq(2*(diff),df=1,lower.tail = FALSE)<0.05),cluster_name),out_file,ncol=10)
#m4.out = optim(c(m2.out$par),lik_fun,treefile=tree_file,seqfile=masked_dna,control=list(fnscale=-1))
#print(m4.out)