-
Notifications
You must be signed in to change notification settings - Fork 4
/
GenbankToFASTAandOrganismTableRow.py
executable file
·207 lines (164 loc) · 7.17 KB
/
GenbankToFASTAandOrganismTableRow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
"""
Created by: Lee Bergstrand
Description: A program that extracts the proteins annotations from a Genbank file and as well as some
information about the organism in the file. Stores the protein annotations as a Fasta.
Appends organism info to a csv file.
Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
"""
# Imports & Setup:
import os
import argparse
from lib import *
# ==========
# Functions:
# ==========
# ----------------------------------------------------------------------------------------
def main(args):
input_file_path = args.in_file[0]
organism_info_flag = args.organism_info
annotation_flag = args.annotation
fasta_file_name = os.path.join(os.getcwd(), str(os.path.basename(input_file_path).split('.')[0]) + '.faa')
csv_file_name = os.path.join(os.getcwd(), 'OrganismDB.csv')
check_extension(input_file_path)
seq_records = extract_sequence_records(input_file_path, 'genbank')
for record in seq_records:
if organism_info_flag:
csv_row = get_organism_info(record)
write_csv(csv_row, csv_file_name)
if annotation_flag:
fasta = get_coding_annotation_fasta(record)
write_fasta(fasta, fasta_file_name)
# ----------------------------------------------------------------------------------------
def check_extension(in_path):
"""
Checks input file extension and give a warning if it is incorrect.
:param in_path: The path to the input file.
"""
in_file_extension = os.path.splitext(in_path)[-1]
if not in_file_extension == ".gbk":
print("[Warning] " + in_file_extension + " may not be a Genbank file!")
# ------------------------------------------------------------------------------------------------------------
def get_organism_info(seq_record):
"""
When passed a sequence record object returns a string containing the organisms info.
:param seq_record: A Biopython sequence record object.
:return: A string containing a CSV file row.
"""
print(">> Extracting Organism Info...")
organism_id = seq_record.id
description = seq_record.description.replace(",", "")
if 'plasmid' in description.lower():
accession_type = 'Plasmid'
elif 'chromosome' in description.lower():
accession_type = 'Chromosome'
elif 'genome' in description.lower():
accession_type = 'Chromosome'
else:
accession_type = 'Unknown'
source = seq_record.annotations['source'].replace(",", "")
taxonomy = "_".join(seq_record.annotations['taxonomy'])
organism_genome_length = len(seq_record.seq)
organism_string = organism_id + "," + accession_type + "," + description + "," + source + "," + taxonomy + "," + str(
organism_genome_length) + "\n"
return organism_string
# -------------------------------------------------------------------------------------------------------------
def get_coding_annotation_fasta(seq_record):
"""
When passed a sequence record object returns an array of FASTA strings for each annotation.
:param seq_record: A Biopython sequence record object.
:return: A FASTA file string containing all sequences record object CDS sequence features.
"""
fasta = []
features = seq_record.features # Each sequence has a list (called features) that stores seqFeature objects.
for feature in features: # For each feature on the sequence
if feature.type == "CDS": # CDS means coding sequence (These are the only feature we're interested in)
feat_qualifiers = feature.qualifiers # Each feature contains a dictionary called qualifiers which contains
# data about the sequence feature (for example the translation)
start = int(feature.location.start) # Type-casting to int strips fuzzy < > characters.
end = int(feature.location.end)
strand = feature.location.strand
if strand is None:
strand = "?"
elif int(strand) < 0:
strand = "-"
elif int(strand) > 0:
strand = "+"
else:
strand = "?"
location = "[" + str(start) + ":" + str(end) + "](" + strand + ")"
# Gets the required qualifiers. Uses featQualifiers.get to return the qualifiers or a default value if the qualifiers
# is not found. Calls strip to remove unwanted brackets and ' from qualifiers before storing it as a string.
protein_id = str(feat_qualifiers.get('protein_id', 'no_protein_id')).strip('\'[]')
if protein_id == 'no_protein_id':
continue # Skips the iteration if protein has no id.
protein_locus = str(feat_qualifiers.get('locus_tag', 'no_locus_tag')).strip('\'[]')
gene = str(feat_qualifiers.get('gene', 'no_gene_name')).strip('\'[]')
product = str(feat_qualifiers.get('product', 'no_product_name')).strip('\'[]')
translated_protein = str(feat_qualifiers.get('translation', 'no_translation')).strip('\'[]')
fasta_part_one = ">" + protein_id + " " + gene + "-" + product + " (Locus: " + protein_locus + ")"
fasta_part_two = " (Location: " + location + ")" + "\n" + translated_protein + "\n"
fasta.append(fasta_part_one + fasta_part_two)
fasta_string = "".join(fasta)
return fasta_string
# ----------------------------------------------------------------------------------------
def write_fasta(fasta, out_file_path):
"""
Writes a FASTA file string to file.
:param fasta: Input FASTA formatted string.
:param out_file_path: The path for the output FASTA file.
"""
try:
print(">> Writing fasta File...")
fasta_writer = open(out_file_path, "w")
fasta_writer.write(fasta)
fasta_writer.close()
except IOError:
print("Failed to open " + out_file_path)
sys.exit(1)
# ----------------------------------------------------------------------------------------
def write_csv(organism_string, out_file_path):
"""
Appends new row to the organism data CSV.
:param organism_string: A CSV row containing organism info.
:param out_file_path: The path to the output CSV file to append too.
"""
try:
print(">> Writing to organism info to CSV file...")
writer = open(out_file_path, "a")
writer.write(organism_string)
writer.close()
except IOError:
print("Failed to open " + "OrganismDB.csv")
sys.exit(1)
print(">> Done")
# ----------------------------------------------------------------------------------------
if __name__ == '__main__':
descriptor = """
A program that extracts the proteins annotations from a Genbank file and as well as some
information about the organism in the file. Stores the protein annotations as a Fasta.
Appends a csv file with the organism info.
"""
parser = argparse.ArgumentParser(description=descriptor)
parser.add_argument('-i', '--in_file', metavar='GENBANK', nargs=1, help='''
The input Genbank file.''')
parser.add_argument('-O', '--organism_info', action='store_true', help='''
Flag to write an organism table row.''')
parser.add_argument('-A', '--annotation', action='store_true', help='''
Flag to extract CDS from input file and write them to a FASTA file.''')
cli_args = parser.parse_args()
# At minimum we require an input file, and one CLI flag.
proceed = True
if cli_args.in_file is None:
print("Error: Missing sequence list path...")
proceed = False
if not cli_args.organism_info and not cli_args.annotation:
print("Error: Missing sequence FASTA file path...")
print("Error: Missing query BLAST database path...")
proceed = False
if proceed:
main(cli_args)
else:
print("")
parser.print_help()
print("")