-
Notifications
You must be signed in to change notification settings - Fork 0
/
exonsToFakeReads.py
140 lines (114 loc) · 4.51 KB
/
exonsToFakeReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
# encoding: utf-8
"""
exonsToFakeReads.py
Created by Brant Faircloth on 2010-03-09.
Copyright (c) 2010 Brant Faircloth. All rights reserved.
For the tetraodon data, we now know what the exons are that likely make up
the genes of the organism. But, we need to smash these exons together into
mRNA-like reads, so we can map them over to Danio, in order to get some idea
of the genes with which we will be working. This program does that.
Usage:
python exonsToFakeReads.py --twobit=data/tetNig2/tetNig2.2bit
--configuration=data/tetNig2/db.conf
--exons=exons --genes=genes --output=data/tetNig2/myExons.fa
"""
import pdb
import os
import sys
import oursql
import bx.seq.twobit
import ConfigParser
import optparse
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def interface():
'''Get the starting parameters from a configuration file'''
usage = "usage: %prog [options]"
p = optparse.OptionParser(usage)
p.add_option('--twobit', dest = 'twobit', action='store', \
type='string', default = None, \
help='The path to the 2bit file.', metavar='FILE')
p.add_option('--configuration', dest = 'conf', action='store', \
type='string', default = None, \
help='The path to the configuration file.')
p.add_option('--exons', dest = 'exons', action='store', \
type='string', default = None, \
help='The table containing the exons.')
p.add_option('--genes', dest = 'genes', action='store', \
type='string', default = None, \
help='The table containing the genes.')
p.add_option('--output', dest = 'output', action='store', \
type='string', default = None, \
help='The path to the output file.', metavar='FILE')
p.add_option('--stitch', dest = 'stitch', action='store_true', \
default=False, help='Stitch exons into gene regions')
(options,arg) = p.parse_args()
if not options.conf:
p.print_help()
sys.exit(2)
return options, arg
def getExons(cur, mrna_id, options):
'''get the exons associated with a particular gene region'''
query = '''SELECT starts_exon, chromo, start, end from %s where \
mrna_id = %s order by start ASC''' % (options.exons, mrna_id)
cur.execute(query)
return cur.fetchall()
def exonStitcher(cur, gene, mrna_id, exons, tb):
'''for a given gene, stitch the sequence of the exons together into a
cumulative whole that represents a quasi-mRNA'''
sequence = None
exonIds = ''
for exon in exons:
exonIds += '%s,' % exon[0]
se, chromo, start, end = exon
if not sequence:
sequence = Seq(tb[chromo][start:end], IUPAC.unambiguous_dna)
else:
sequence += Seq(tb[chromo][start:end], IUPAC.unambiguous_dna)
record = SeqRecord(sequence)
record.id = 'Tetraodon_Gene_%s' % (gene)
record.name = record.id
record.description = 'Tetraodon putative gene %s, mrna_id = %s, exons %s' % (gene, mrna_id, exonIds)
return [record]
def exonOnly(cur, gene, mrna_id, exons, tb):
#pdb.set_trace()
records = []
for count, exon in enumerate(exons):
sequence = None
se, chromo, start, end = exon
sequence = Seq(tb[chromo][start:end], IUPAC.unambiguous_dna)
record = SeqRecord(sequence)
record.id = 'Tetraodon_Gene_%s_Exon_%s_mRNA_%s' % (gene, exon[0], mrna_id)
record.name = record.id
record.description = 'Tetraodon putative gene %s, exon %s, mrna_id = %s' % (gene, exon[0], mrna_id)
records.append(record)
return records
def main():
conf = ConfigParser.ConfigParser()
options, arg = interface()
conf.read(options.conf)
conn = oursql.connect(
user=conf.get('Database','USER'),
passwd=conf.get('Database','PASSWORD'),
db=conf.get('Database','DATABASE')
)
cur = conn.cursor()
tb = bx.seq.twobit.TwoBitFile(file(os.path.abspath(options.twobit)))
# get list of genes
query = '''SELECT id, mrna_id from %s''' % options.genes
cur.execute(query)
genes = cur.fetchall()
holder = []
for gene, mrna_id in genes:
exons = getExons(cur, mrna_id, options)
if options.stitch:
rna = exonStitcher(cur, gene, mrna_id, exons, tb)
else:
rna = exonOnly(cur, gene, mrna_id, exons, tb)
holder += rna
SeqIO.write(holder, open(options.output, 'w'), 'fasta')
if __name__ == '__main__':
main()