-
Notifications
You must be signed in to change notification settings - Fork 7
/
Ensembl_proteome_manager.py
856 lines (723 loc) · 38.2 KB
/
Ensembl_proteome_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
"""'Ensembl_proteome_manager.py' written by Delan Huang, OHSU, July 2017.
The MIT License (MIT)
Copyright (c) 2017 OHSU
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Direct questions to:
Technology & Research Collaborations, Oregon Health & Science University,
Ph: 503-494-8200, FAX: 503-494-4729, Email: techmgmt@ohsu.edu.
Delan Huang, 2017-07-12
TODO:
- More error checking (e.g. adding/dropping when nothing is selected)
- Aesthetic/housekeeping changes to both UI and code
- Overall, program is very rough but functional
"""
# debugging and edits -PW 8/10/2017
# added a little to reporting species without databases. -PW 20200502
# may need to check release in pickle versus current on FTP site
# Built-in module imports
from tkinter import *
from tkinter.ttk import *
from tkinter import messagebox
from tkinter import filedialog
import os
import sys
import ftplib
import re
import urllib.request
import pickle
from datetime import datetime
# Imports dependent on other files
# This python file only uses built-in modules, no external downloads required
try:
import fasta_lib
import Ensembl_fixer
import reverse_fasta
except ImportError:
print("Could not import all files.")
sys.exit()
# Helper Classes
class CheckBoxes(Frame):
    """A frame that creates and packs one check button per label.

    The state of each check button is held in an ``IntVar`` stored, in label
    order, in ``self.vars`` (1 = checked, 0 = unchecked).
    """

    def __init__(self, parent=None, checkboxes=None, side=LEFT):
        """Create one check button for every label in ``checkboxes``.

        parent: master widget passed to ``Frame.__init__``.
        checkboxes: iterable of label strings; ``None`` (the default) creates
            no buttons. A ``None`` default is used instead of a shared
            mutable ``[]`` default.
        side: ``pack`` side used for every check button.
        """
        Frame.__init__(self, parent)
        self.vars = []
        for label in checkboxes or ():
            state = IntVar()
            button = Checkbutton(self, text=label, variable=state)
            button.pack(side=side, fill=X, expand=YES, padx=10)
            self.vars.append(state)

    def get_state(self):
        """Return an iterator over the check box values, in creation order."""
        return map(lambda var: var.get(), self.vars)

    def check_all(self):
        """Set every check box to checked (1)."""
        for var in self.vars:
            var.set(1)

    def uncheck_all(self):
        """Set every check box to unchecked (0)."""
        for var in self.vars:
            var.set(0)
class AnimalEntry:
    """Record describing one species row of the Ensembl proteome table."""

    def __init__(self, c_n, l_n, taxid, e_a, acc, g_m, v_d, r_d, p_a):
        """Store the table columns; folder and FTP path start out empty.

        c_n: species common name
        l_n: species latin name
        taxid: taxonomy ID number
        e_a: Ensembl assembly
        acc: Ensembl accession
        g_m: gene build method
        v_d: variation database name
        r_d: regular database
        p_a: pre-assembly information
        """
        self.common_name, self.latin_name, self.tax_ID = c_n, l_n, taxid
        self.ensembl_assembly, self.accession = e_a, acc
        self.genebuild_method = g_m
        self.variation_database, self.reg_database = v_d, r_d
        self.pre_assembly = p_a
        # Filled in later by the code that builds the species list
        self.folder_name = ""      # local folder name for this species
        self.ftp_file_path = ""    # FTP download path for this species

    def _dump(self):
        """Print every attribute, one labelled line each (diagnostic aid)."""
        report = (
            ('\ncommon name:', self.common_name),
            ('latin name:', self.latin_name),
            ('tax ID:', self.tax_ID),
            ('assembly:', self.ensembl_assembly),
            ('accession:', self.accession),
            ('gene build method:', self.genebuild_method),
            ('variation DB:', self.variation_database),
            ('regular DB:', self.reg_database),
            ('pre-assembly:', self.pre_assembly),
            ('folder:', self.folder_name),
            ('ftp file path:', self.ftp_file_path),
        )
        for label, value in report:
            print(label, value)
# Build GUI
class GUI:
    """Main GUI class for application.

    Holds the list of Ensembl species (parsed from the species web page or
    restored from a local pickle file), the Tk widgets, and the FTP
    connection used to list and download the protein FASTA files.
    """

    # Pickle file (kept in the script folder) caching the parsed species list
    PICKLE_NAME = 'Ensembl_current_release.pickle'
    # Number of captured table cells that describe one species
    CELLS_PER_ANIMAL = 9

    # Patterns are compiled once here rather than on every call
    _COMMON_NAME_RE = re.compile(r"alt=\"(.*?)\"")
    _LATIN_NAME_RE = re.compile(r"<i\b[^>]*>(.*?)</i>")
    _CELL_RE = re.compile(r'<td\b[^>]*>(.*?)</td>|</span\b[^>]*>(.*?)</span>')
    _RELEASE_RE = re.compile(r"Ensembl Release\s+(\d+)")

    def __init__(self, url, prot_path, text, headers, banned_list, script_location, default_contams):
        """Create object and set some state attributes.

        url: host name of the Ensembl FTP site.
        prot_path: FTP path of the folder holding the per-species databases.
        text: HTML text of the Ensembl species web page.
        headers: column names used for both tree views.
        banned_list: substrings; FTP files containing one are not downloaded.
        script_location: folder containing this script (pickle/defaults live here).
        default_contams: file name of the default contaminants FASTA file.
        """
        self.url = url                                  # Url of Ensembl FTP site
        self.ensembl_prot_path = prot_path              # Location of Ensembl databases
        self.ensembl_ftp = os.path.dirname(prot_path)   # top level where databases are
        self.ftp = None                                 # FTP object (set in login method)
        self.text = text                                # HTML text of webpage
        self.raw_table = ""                             # HTML text of just animals table
        self.selected_entries = []                      # List of selected AnimalEntry objects
        self.animal_list = []                           # List of all AnimalEntry objects
        self.banned_list = banned_list                  # File identifiers to be omitted when downloading
        self.release = ''                               # Ensembl release number
        self.version = ''                               # string like "v89"
        self.headers = headers                          # Needed for columns in tables
        self.proteome_IDs = []                          # List of unique proteome IDs
        self.script_location = script_location          # Script path location
        self.contams_database = os.path.join(self.script_location, default_contams)
        # typical default species file path
        self.selected_default = os.path.join(script_location, 'default_Ensembl_species.txt')
        self.data = None                                # Unpickled information saved from last session
        self.quit_save_state = "not triggered"          # Trigger for updating defaults file on quit
        self.abs_dl_path = ""                           # Download container (set in download_databases)

        # List of characters that cannot be in folder names
        self.illegal_characters = r"[\\#%&{}/<>*?:]"

    # Helper Class Functions
    # FTP support
    def login(self):
        """Open an FTP connection and login.

        Any connection left from an earlier call is closed first; this method
        is called from several places and would otherwise leave one open
        connection behind per call.
        """
        self.logout()
        self.ftp = ftplib.FTP()
        self.ftp.connect(str(self.url))
        self.ftp.login()

    def logout(self):
        """Close the FTP connection (best effort) and forget it."""
        connection = getattr(self, 'ftp', None)
        if connection is None:
            return  # nothing to close
        try:
            connection.quit()
        except ftplib.all_errors + (AttributeError,):
            # The connection was never opened or has already dropped;
            # release the local socket without talking to the server.
            connection.close()
        finally:
            self.ftp = None

    # some parsing support
    def clean_common_name(self, name):
        """Return the text of the ``alt="..."`` attribute found in ``name``.

        ``name`` is returned unchanged when it has no ``alt`` attribute.
        """
        found = self._COMMON_NAME_RE.search(name)
        return found.group(1) if found else name

    def clean_latin_name(self, name):
        """Return the text inside the first ``<i>...</i>`` element of ``name``.

        ``name`` is returned unchanged when it has no ``<i>`` element.
        """
        found = self._LATIN_NAME_RE.search(name)
        return found.group(1) if found else name

    def create_raw_table(self):
        """Find the species table boundaries in the HTML page.

        ``self.raw_table`` becomes the page text from the first ``<td`` up to
        (not including) the next ``</table>``; it is set to an empty string
        when either marker is missing.
        """
        page = self.text
        start = page.find("<td")
        end = page.find("</table>", start) if start != -1 else -1
        self.raw_table = page[start:end] if end != -1 else ""

    # Ensembl Animal Entry support
    def _pickle_path(self):
        """Full path of the pickle cache file inside the script folder."""
        return os.path.join(self.script_location, self.PICKLE_NAME)

    def load_all_entries(self):
        """Load Ensembl proteome entries from the pickle file.

        Reads the current release number from ``current_README`` on the FTP
        site and compares it with the pickled one.

        Returns True when the pickled entries were loaded, False when the
        pickle file is missing, unreadable, or from a different release (in
        which case ``self.release``/``self.version`` are set to the current
        release). Raises RuntimeError when the README has no release number.
        """
        # get the contents of current_README file
        self.login()
        self.ftp.cwd(self.ensembl_ftp)
        listing = []
        self.ftp.retrlines('RETR current_README', listing.append)

        # Get the current release version from current_README (last match wins)
        current_release = None
        for line in listing:
            found = self._RELEASE_RE.search(line)
            if found:
                current_release = int(found.group(1))
        if current_release is None:
            raise RuntimeError("Could not find 'Ensembl Release <number>' in current_README")

        # see if pickle file exists
        if not os.path.exists(self._pickle_path()):
            print('pickle file not present')
            self.release = current_release
            self.version = "v{}".format(self.release)
            return False

        # get data from pickle file
        try:
            self.data = self.unpickle_entries()
            saved_release = self.data["Release"]
            saved_entries = self.data["Entries"]
        except (OSError, EOFError, pickle.UnpicklingError, AttributeError, KeyError, TypeError):
            print('pickle file could not be read')
            self.release = current_release
            self.version = "v{}".format(self.release)
            return False

        # if pickled version matches current database version, use the pickled entries
        if saved_release == current_release:
            self.release = saved_release
            self.version = "v{}".format(self.release)
            self.animal_list = saved_entries
            return True

        print('saved release is out-of-date')
        self.release = current_release
        # keep the version string in step; it prefixes downloaded file names
        self.version = "v{}".format(self.release)
        return False

    def parse_raw_table(self):
        """Get Ensembl proteome entries.

        Uses the pickled entries when they are current; otherwise parses
        ``self.raw_table``, drops species without FTP folders, and pickles
        the result.
        """
        if self.load_all_entries():
            return  # pickled entries were read in and were current
        print('fetching data from web')

        # The pattern has two alternative groups; keep whichever one matched
        # ('' when the cell was empty).
        cells = [first or second for first, second in self._CELL_RE.findall(self.raw_table)]

        # Each animal has 9 attributes; a trailing incomplete row is ignored
        usable = len(cells) - len(cells) % self.CELLS_PER_ANIMAL
        for start in range(0, usable, self.CELLS_PER_ANIMAL):
            row = cells[start:start + self.CELLS_PER_ANIMAL]
            common_name = self.clean_common_name(row[0])
            latin_name = self.clean_latin_name(row[1])
            tax_id = row[2]
            if not str(tax_id).isdigit():  # In case tax_id is something other than a number
                tax_id = "000"

            # Create main animal entry
            animal_obj = AnimalEntry(common_name, latin_name, tax_id, *row[3:])

            # Folder name (ftp download path is set in remove_invalid_animals method)
            folder_name = "{}__{}__{}".format(animal_obj.common_name, animal_obj.latin_name,
                                              animal_obj.tax_ID)
            animal_obj.folder_name = re.sub(self.illegal_characters, "_", folder_name)

            self.animal_list.append(animal_obj)

        self.remove_invalid_animals()  # FTP paths are set in this method

        # save the fetched species information
        self.pickle_entries()

    def remove_invalid_animals(self):
        """Keep only animals that have an FTP folder, and set their FTP paths.

        An animal is dropped when its assembly is '-' or when no FTP folder
        matches its latin name (directly or via ``double_check_animal``).
        """
        folders = self.get_animal_directory()  # list of FTP folders
        folder_set = set(folders)
        kept = []
        for animal in self.animal_list:
            if animal.ensembl_assembly == '-':  # no ftp if no assembly?
                print('no FTP files:', animal.common_name)
                continue
            folder = animal.latin_name.lower().replace(" ", "_")
            if folder not in folder_set:
                folder = self.double_check_animal(folder, folders)
            if not folder:
                print('unknown animal:', animal.common_name)
                continue
            animal.ftp_file_path = r"{}/{}/pep/".format(self.ensembl_prot_path, folder)
            kept.append(animal)

        print('starting animal list length:', len(self.animal_list))
        self.animal_list[:] = kept  # replace contents in place
        print('ending animal list length:', len(self.animal_list))

    def double_check_animal(self, test_name, actual_list):
        """Find an FTP folder whose name parts are all part of ``test_name``.

        Latin species names in the table do not always match FTP folder names
        (gorilla and dog). Names are split on '_'; the first folder in
        ``actual_list`` whose parts are a subset of (or equal to) the parts
        of ``test_name`` is returned, or None when no folder qualifies.
        """
        wanted = set(test_name.split('_'))
        for folder in actual_list:
            if set(folder.split('_')) <= wanted:
                return folder
        return None

    def get_animal_directory(self):
        """Return the last field of every line of an FTP ``LIST`` of the databases folder."""
        self.login()
        self.ftp.cwd(self.ensembl_prot_path)
        listing = []
        self.ftp.retrlines('LIST', listing.append)  # get list of folders
        # blank lines have no fields, so they are skipped
        return [line.split()[-1].strip() for line in listing if line.strip()]

    def pickle_entries(self):
        """Save the release number and the full species list to the pickle file.

        The file is written by full path into the script folder, the same
        location ``load_all_entries`` checks, so the working directory does
        not matter.
        """
        payload = {"Release": self.release, "Entries": self.animal_list}
        with open(self._pickle_path(), 'wb') as file:
            pickle.dump(payload, file)

    def unpickle_entries(self):
        """Load and return the dictionary stored in the pickle file."""
        # NOTE(review): pickle.load executes code embedded in the file; this is
        # only safe while the pickle is a local file written by this program.
        with open(self._pickle_path(), 'rb') as file:
            return pickle.load(file)

    # list management functions
    def filter_entries(self):
        """Set ``self.selected_entries`` to the animals matching both filters.

        An animal matches when the taxonomy entry text is a substring of its
        tax ID and the species entry text (lower-cased) is a substring of its
        common or latin name. Empty filters match everything.
        """
        species_entry = self.search_species.get().lower()
        tax_entry = self.search_tax.get()
        self.selected_entries = [
            entry for entry in self.animal_list
            if tax_entry in str(entry.tax_ID)
            and (species_entry in entry.common_name.lower()
                 or species_entry in entry.latin_name.lower())
        ]

    def get_filtered_proteome_list(self):
        """Apply the filters and show the matching species in the left tree.

        When nothing matches, the user is asked whether to show all species;
        answering no leaves the left tree unchanged.
        """
        self.filter_entries()

        if not self.selected_entries:
            # Ask if user wants all entries shown if nothing matched
            answer = messagebox.askyesno("Are you sure?",
                                         "No databases found. Would you like to show all databases?")
            if not answer:
                return None
            self.selected_entries = self.animal_list

        # Only show relevant info to user in entries
        entries = [[entry.common_name, entry.latin_name,
                    entry.tax_ID, entry.ensembl_assembly]
                   for entry in self.selected_entries]

        # clear entries before importing
        for row in self.tree_left.get_children():
            self.tree_left.delete(row)
        for entry in sorted(entries):
            self.tree_left.insert('', 'end', values=entry)

        self.update_status_bar("List updated with %s entries" % len(self.selected_entries))

    def reset_filters(self):
        """Clear both search fields, uncheck the options, and refresh the left tree."""
        self.search_species.delete(0, END)
        self.search_tax.delete(0, END)
        self.reverse_contams.uncheck_all()
        self.get_filtered_proteome_list()

    def browse_contams(self):
        """Dialog to browse to non-default contaminants database.

        The current database is kept when no file is returned (presumably
        what ``fasta_lib.get_file`` does when the dialog is cancelled).
        """
        chosen = fasta_lib.get_file(self.script_location,
                                    [('Fasta files', '*.fasta')],
                                    "Select a contaminants FASTA file")
        if not chosen:
            return
        self.contams_database = chosen
        self.contams_label.config(text=os.path.split(self.contams_database)[1])

    def _sort_column(self, tv, col, reverse, key, resort):
        """Reorder the rows of ``tv`` by column ``col`` and flip the next sort.

        key: function turning a cell value into its sort key.
        resort: the public sort method to bind to the heading for the next click.
        """
        rows = [(tv.set(item, col), item) for item in tv.get_children('')]
        rows.sort(key=lambda pair: key(pair[0]), reverse=reverse)

        # rearrange items in sorted positions
        for index, (_, item) in enumerate(rows):
            tv.move(item, '', index)

        # reverse sort next time
        tv.heading(col, command=lambda: resort(tv, col, not reverse))

    def sort_text_column(self, tv, col, reverse=False):
        """Sort entries in a treeview column alphabetically (case-insensitive)."""
        self._sort_column(tv, col, reverse, lambda value: str(value).lower(),
                          self.sort_text_column)

    def sort_num_column(self, tv, col, reverse=False):
        """Sort entries in a treeview column numerically (values must convert with ``int``)."""
        self._sort_column(tv, col, reverse, int, self.sort_num_column)

    def drop_from_right(self):
        """Remove the selected entry(ies) from the right treeview."""
        selection = self.tree_right.selection()  # item ids such as "I001"
        if not selection:
            print("User tried to remove a proteome even though none was selected!")
            return
        last_values = None
        for selected in selection:
            last_values = self.tree_right.item(selected)['values']
            self.tree_right.delete(selected)
        # status bar reports the last entry dropped
        self.update_status_bar("{} dropped".format(last_values[0]))

    def copy_to_right(self):
        """Copy the selected entry(ies) from the left treeview to the right.

        Entries already present in the right treeview are not added again.
        """
        selection = self.tree_left.selection()
        if not selection:
            print("User tried to add a proteome even though none was selected!")
            return
        present = [self.tree_right.item(row)['values'] for row in self.tree_right.get_children()]
        last_values = None
        for selected in selection:
            last_values = self.tree_left.item(selected)['values']
            if last_values not in present:
                self.tree_right.insert('', 'end', values=last_values)
                present.append(last_values)
        # status bar reports the last selected entry; species name is first
        self.update_status_bar("{} added".format(last_values[0]))

    # loading and saving species list function
    def save_defaults(self, overwrite=False):
        """Save the species in the right treeview to a default species text file.

        overwrite: when True, write to ``self.selected_default`` without
            asking; when False, ask the user for a file name first.

        One species is written per line as the text of a Python list
        (common name, latin name, tax ID, assembly), duplicates removed and
        sorted by common name.
        """
        desired_file = self.selected_default
        if not overwrite:
            desired_file = fasta_lib.save_file(self.script_location, [('Text files', '*.txt')],
                                               default_file=os.path.split(self.selected_default)[1],
                                               title_string='Specify a default species file name')
        if not desired_file:
            return
        try:
            rows = [self.tree_right.item(item)['values'] for item in self.tree_right.get_children()]
            for row in rows:
                # strip stray quote characters that older default files accumulated
                row[-1] = str(row[-1]).rstrip(r"""\'"*""")

            # Remove duplicates, sort DBs by common name
            unique_rows = sorted({tuple(row) for row in rows}, key=lambda row: row[0])

            with open(desired_file, "w") as defaults_txt:
                self.selected_default = desired_file
                for row in unique_rows:
                    defaults_txt.write("{}\n".format(list(row)))

            self.status_bar.config(text="Databases saved to species text file")
        except OSError:
            messagebox.showwarning("Invalid Filename!", "Cannot save species list to selected folder!")

    def select_defaults_and_load(self):
        """Let user browse to a defaults file and load the species.

        The previously selected defaults file is kept when no file is
        returned by the dialog.
        """
        chosen = fasta_lib.get_file(self.script_location,
                                    [('Text files', '*.txt')],
                                    'Select a default Ensembl species list file')
        if not chosen:
            self.update_status_bar("No defaults imported/defaults could not be found")
            return
        self.selected_default = chosen
        self.load_defaults()

    @staticmethod
    def _parse_default_line(line):
        """Parse one line of a defaults file.

        Returns ``[common name, latin name, tax ID (int), assembly]`` or None
        when the line is blank or is not a list literal with four fields.
        """
        import ast  # local import: only needed here

        line = line.strip()
        if not line:
            return None
        try:
            fields = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            return None
        if not isinstance(fields, (list, tuple)) or len(fields) < 4:
            return None
        common_name, latin_name, tax_id, assembly = fields[:4]
        try:
            tax_id = int(tax_id)
        except (TypeError, ValueError):
            return None
        # older files may carry stray quote characters on the assembly
        assembly = str(assembly).rstrip(r"""'\"*""")
        return [common_name, latin_name, tax_id, assembly]

    def load_defaults(self, display=True):
        """Load the species list stored in ``self.selected_default``.

        display: when True, the right treeview is cleared and refilled.

        Returns the loaded rows sorted by common name, or None when the file
        could not be opened. Lines that cannot be parsed are skipped.
        """
        try:
            with open(self.selected_default, "r") as defaults_txt:
                lines = defaults_txt.readlines()
            self.status_bar.config(text="default species list imported.")
        except (FileNotFoundError, TypeError):
            self.update_status_bar("No defaults imported/defaults could not be found")
            return None
        except OSError:
            messagebox.showwarning("Invalid File!", "Invalid file selection!")
            return None

        # Clear selected databases before importing
        if display:
            for row in self.tree_right.get_children():
                self.tree_right.delete(row)

        parsed = (self._parse_default_line(line) for line in lines)
        loaded_databases = sorted((row for row in parsed if row is not None),
                                  key=lambda row: row[0])  # sort DBs by common name

        if display:
            for database in loaded_databases:
                self.tree_right.insert('', 'end', values=database)

        return loaded_databases

    def update_defaults(self):
        """Offer to save the right treeview when it differs from the defaults file."""
        rows = [self.tree_right.item(entry)['values'] for entry in self.tree_right.get_children()]

        # Remove duplicates, sort DBs by common name
        current = sorted([list(row) for row in {tuple(row) for row in rows}],
                         key=lambda row: row[0])

        # compare current right-side databases to stored defaults
        if current == self.load_defaults(display=False):
            return
        if os.path.exists(self.selected_default):
            question = "Right species list differs from defaults! Would you like to save?"
        else:
            question = "Save right species list for next time?"
        if messagebox.askyesno("Unsaved Progress", question):
            self.quit_save_state = True
            self.save_defaults(overwrite=True)

    def download_databases(self):
        """Fetch the database files for the species in the right treeview.

        The user picks a container folder; files go into
        ``Ensembl_v<release>/<species folder>/`` inside it, named
        ``<version>__<common name>__<FTP file name>`` with illegal characters
        replaced by '_'. Each downloaded file is passed to
        ``make_fasta_files``. Files matching the banned list are skipped and
        duplicate selections are downloaded only once.
        """
        self.login()  # refresh the FTP connection

        rows = [self.tree_right.item(entry)['values'] for entry in self.tree_right.get_children()]
        # throw warning if no databases selected
        if not rows:
            messagebox.showwarning("Empty Selection", "No databases were selected for download!")
            return None

        # get parent folder location for database download
        db_default = os.getcwd()
        self.abs_dl_path = filedialog.askdirectory(parent=self.root, initialdir=db_default,
                                                   title='Select container for Ensembl downloads')
        if not self.abs_dl_path:
            return None

        # Make a separate folder to contain all files
        ensembl_dir_name = r"Ensembl_v{}".format(self.release)
        ensembl_dir_path = os.path.join(self.abs_dl_path, ensembl_dir_name)
        os.makedirs(ensembl_dir_path, exist_ok=True)
        os.chdir(ensembl_dir_path)

        # (common name, tax id) pairs from the right tree view, duplicates dropped in order
        selected = [(row[0], row[2]) for row in rows]
        unique_selected = list(dict.fromkeys(selected))
        if len(unique_selected) != len(selected):
            messagebox.showwarning("Duplicates found!",
                                   "Duplicate databases were selected and will be ignored!")

        # Selected animal objects matching both tax id and common name
        download_entries = [entry
                            for name, tax_id in unique_selected
                            for entry in self.animal_list
                            if int(tax_id) == int(entry.tax_ID) and name == entry.common_name]

        for entry in download_entries:
            # Change ftp directory for each species
            self.ftp.cwd(entry.ftp_file_path)

            # Create a folder for each species
            download_folder = os.path.join(ensembl_dir_path, entry.folder_name)
            os.makedirs(download_folder, exist_ok=True)
            os.chdir(download_folder)

            # Create a list of all files in each species folder
            listing = []
            self.ftp.retrlines('LIST', listing.append)

            for line in listing:
                fields = line.split()
                if not fields:
                    continue  # blank listing line
                fname = fields[-1]  # file name is the last item

                # Skip any files that we do not want to download
                if self.banned_file(fname):
                    continue

                fixed_fname = "{}__{}__{}".format(self.version, entry.common_name, fname)
                fixed_fname = re.sub(self.illegal_characters, "_", fixed_fname)
                # Full path: make_fasta_files changes the working directory,
                # so a relative name would land in the wrong folder for a
                # second file of the same species.
                local_path = os.path.join(download_folder, fixed_fname)

                self.update_status_bar("Downloading {} file".format(fname))
                # the file must be closed (flushed) before it is processed
                with open(local_path, 'wb') as local_file:
                    self.ftp.retrbinary('RETR {}'.format(fname), local_file.write)
                print("{} is done downloading".format(fname))

                self.make_fasta_files(local_path, entry)

        messagebox.showinfo("All Downloads Completed!", "Downloads Finished!")

    def make_fasta_files(self, file_location, entry):
        """Run ``Ensembl_fixer`` on a downloaded file, then build the extra databases.

        file_location: path of the downloaded file.
        entry: the AnimalEntry being processed (not used here).
        """
        # analyze and fix descriptions (also uncompresses the file)
        new_fasta_file = Ensembl_fixer.main(file_location, up_one=True)

        contam_location = self.contams_database
        print('contams:', contam_location)

        # work from the release folder inside the download container
        os.chdir(os.path.join(self.abs_dl_path, r"Ensembl_v{}".format(self.release)))

        # Add forward/reverse/contams, as specified by checkboxes
        self.process_databases(new_fasta_file, contam_location)

    def process_databases(self, fasta_file, contam_location):
        """Pass the check box selections to ``reverse_fasta.main``.

        The first check box sets the ``both`` argument and the second sets
        ``forward``; ``reverse`` is always False. Nothing is called when
        neither box is checked. More documentation on the arguments can be
        found in the reverse_fasta.py file.
        """
        states = list(self.reverse_contams.get_state())
        decoy_contams, target_contams = states[0], states[1]
        if not (decoy_contams or target_contams):
            return
        forward = bool(target_contams)
        both = bool(decoy_contams)
        reverse_fasta.main(fasta_file, forward, False, both, contam_path=contam_location)

    def banned_file(self, fname):
        """Return True when ``fname`` contains any banned identifier (case-insensitive)."""
        lowered = fname.lower()
        return any(ban.lower() in lowered for ban in self.banned_list)

    def update_status_bar(self, _text):
        """Show ``_text`` in the status bar and redraw it right away."""
        self.status_bar.config(text=_text)
        self.root.update_idletasks()

    def quit_gui(self):
        """Close the FTP connection, offer to save the species list, and exit."""
        self.logout()  # close the FTP connection
        self.update_defaults()
        self.root.withdraw()
        self.root.update_idletasks()
        self.root.destroy()
        sys.exit()

    def _build_species_tree(self, parent):
        """Create, pack and return a species treeview with scrollbars inside ``parent``."""
        tree = Treeview(parent, columns=self.headers, show="headings")
        tree.pack(fill=BOTH, expand=YES, side=LEFT, padx=5, pady=5)
        for col in self.headers:
            if col in ["TAX ID"]:
                # numeric column: numeric sort, right aligned
                tree.heading(col, text=col.title(),
                             command=lambda col_=col: self.sort_num_column(tree, col_))
                tree.column(col, minwidth=25, width=100, stretch=NO, anchor=E)
            else:
                tree.heading(col, text=col.title(), anchor=W,
                             command=lambda col_=col: self.sort_text_column(tree, col_))
                tree.column(col, minwidth=25, width=150, stretch=NO)

        # Add scrollbars to the TreeView
        scroll_y = Scrollbar(parent, orient=VERTICAL)
        scroll_y.pack(side=RIGHT, fill=Y)
        scroll_x = Scrollbar(tree, orient=HORIZONTAL)
        scroll_x.pack(side=BOTTOM, fill=X)
        tree.config(yscrollcommand=scroll_y.set, xscrollcommand=scroll_x.set)
        scroll_y.config(command=tree.yview)
        scroll_x.config(command=tree.xview)
        return tree

    # Main Create GUI Function
    def create_gui(self):
        """Create the main GUI window, load the species lists, and start the event loop."""
        self.root = Tk()
        self.root.title("Ensembl Reference Proteome Downloader")
        self.root.geometry("1250x700+250+150")
        self.root.minsize(1250, 650)

        # Options frame (filters, extra databases, contaminants)
        option_frame = Frame(self.root)
        option_frame.pack(side=TOP, padx=1, pady=5)

        # Search Window
        search_window_frame = LabelFrame(option_frame, text="Filters:")
        search_window_frame.pack(side=TOP, fill=BOTH, expand=YES, padx=0, pady=5)

        # Create search bars/buttons
        species_frame = Frame(search_window_frame)
        species_frame.pack(fill=X, padx=5, pady=5)
        species_label = Label(species_frame, text="Species Name:")
        species_label.pack(side=LEFT, padx=5, pady=5)
        self.search_species = Entry(species_frame)
        self.search_species.pack(side=RIGHT, fill=X, expand=YES, padx=5, pady=5)

        tax_frame = Frame(search_window_frame)
        tax_frame.pack(fill=X, padx=5, pady=5)
        tax_label = Label(tax_frame, text="Taxonomy ID:")
        tax_label.pack(side=LEFT, padx=5, pady=5)
        self.search_tax = Entry(tax_frame)
        self.search_tax.pack(side=RIGHT, fill=X, expand=YES, padx=5, pady=5)

        # Show filtered list button and reset filters button
        filter_button = Button(search_window_frame, text="Show Filtered List",
                               command=self.get_filtered_proteome_list)
        filter_button.pack(side=LEFT, padx=10, pady=10)
        clear_button = Button(search_window_frame, text="Reset Filters", command=self.reset_filters)
        clear_button.pack(side=RIGHT, padx=10, pady=10)

        # Additional Database Processing Frame
        rev_frame = LabelFrame(option_frame, text="Create Additional Databases:")
        rev_frame.pack(fill=BOTH, expand=YES, padx=1, pady=5)

        # options as check boxes
        self.reverse_contams = CheckBoxes(rev_frame, ["Target+Decoy w/Contams", "Target w/Contams"])
        self.reverse_contams.pack(side=LEFT, fill=X, padx=5, pady=5)

        # option to change the contams database
        contams_frame = Frame(option_frame)
        contams_frame.pack(fill=BOTH, expand=YES, padx=10, pady=5)
        self.contams_label = Label(contams_frame, text=os.path.split(self.contams_database)[1])
        self.contams_label.pack(side=LEFT, padx=5, pady=5)
        contams_button = Button(contams_frame, text="Change Contaminants Database",
                                command=self.browse_contams)
        contams_button.pack(side=LEFT, padx=5, pady=5)

        # Frame holding both species lists and the buttons between them
        entry_frame = LabelFrame(self.root, text="Ensembl Databases")
        entry_frame.pack(side=TOP, fill=BOTH, expand=YES, padx=5, pady=5)

        # Left Window
        left_tree_frame = LabelFrame(entry_frame, text="Ensembl Proteomes")
        left_tree_frame.pack(fill=BOTH, expand=YES, side=LEFT, padx=5, pady=10)
        self.tree_left = self._build_species_tree(left_tree_frame)

        # Menu Buttons
        button_frame = LabelFrame(entry_frame, text="Menu Buttons")
        button_frame.pack(side=LEFT)

        buttons = [("Add Proteome(s)", self.copy_to_right),
                   ("Drop Proteome(s)", self.drop_from_right),
                   ("Save Default Species", self.save_defaults),
                   ("Load Default Species", self.select_defaults_and_load),
                   ("Download", self.download_databases),
                   ("Quit", self.quit_gui)]
        btn_width = 18
        for btn_name, btn_command in buttons:
            button = Button(button_frame, text=btn_name, command=btn_command)
            button.pack()
            button.config(width=btn_width)

        # Right Window
        right_tree_frame = LabelFrame(entry_frame, text="Selected Proteomes")
        right_tree_frame.pack(fill=BOTH, expand=YES, side=RIGHT, padx=5, pady=10)
        self.tree_right = self._build_species_tree(right_tree_frame)

        # Miscellaneous Frame
        misc_frame = Frame(self.root)
        misc_frame.pack(side=BOTTOM, fill=X, padx=5, pady=5)

        # Status Bar
        status_frame = LabelFrame(misc_frame, text="Status")
        status_frame.pack(side=TOP, fill=X, padx=5, pady=5)
        self.status_bar = Label(status_frame, text="", relief=SUNKEN)
        self.status_bar.pack(fill=X, padx=5, pady=5)

        # open the FTP connection
        self.login()
        self.load_defaults()  # initial import of defaults
        self.create_raw_table()
        self.parse_raw_table()  # Create Entry objects
        self.root.protocol("WM_DELETE_WINDOW", self.quit_gui)  # Override window close event
        self.get_filtered_proteome_list()  # show the full left list to start
        self.root.mainloop()
# Main Function
if __name__ == '__main__':
    # Script entry point: fetch the Ensembl species page, then start the GUI.

    # Global Variables
    FTP_URL = 'ftp.ensembl.org'                       # Ensembl FTP host
    PROT_PATH = '/pub/current_fasta'                  # FTP folder holding the species databases
    HEADERS = ["COMMON NAME", "LATIN NAME", "TAX ID", "ENSEMBL ASSEMBLY"]  # tree view columns
    BANNED = ["README", "CHECKSUMS", "abinitio.fa.gz"]  # files that are never downloaded
    SCRIPT_LOCATION = os.path.dirname(os.path.realpath(__file__))
    DEFAULT_CONTAMS = 'Thermo_contams.fasta'

    # message to user
    print('Starting Ensembl_proteome_manager.py - querying Ensembl...')

    # Get HTML page from Ensembl for parsing; the response is closed as soon
    # as it has been read instead of being left open for the whole session.
    PARSE_URL = r'http://www.ensembl.org/info/about/species.html'
    with urllib.request.urlopen(PARSE_URL) as RESPONSE:
        DATA = RESPONSE.read()
    TEXT = DATA.decode('utf-8')

    # create the GUI object and start program
    gui = GUI(FTP_URL, PROT_PATH, TEXT, HEADERS, BANNED, SCRIPT_LOCATION, DEFAULT_CONTAMS)
    gui.create_gui()