Skip to content

Commit

Permalink
WIP A/H7N6
Browse files Browse the repository at this point in the history
Very ad-hoc / temporary approach I used to generate FASTA files which
I could then use as inputs for avian-flu builds.

Avoids any use of RethinkDB: no database connection is made and nothing is uploaded.

See <nextstrain/avian-flu#108> for the corresponding
avian-flu work.
  • Loading branch information
jameshadfield committed Dec 2, 2024
1 parent d632af0 commit dc313d6
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 9 deletions.
45 changes: 41 additions & 4 deletions vdb/avian_flu_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, **kwargs):
('a / h7n1', ''): ('a', 'h7n1', None),
('a / h7n2', ''): ('a', 'h7n2', None),
('a / h7n3', ''): ('a', 'h7n3', None),
('a / h7n6', ''): ('a', 'h7n6', None),
('a / h7n7', ''): ('a', 'h7n7', None),
('a / h7n9', ''): ('a', 'h7n9', None),
('a / h9n2', ''): ('a', 'h9n2', None),
Expand All @@ -52,6 +53,7 @@ def __init__(self, **kwargs):
('b', 'victoria'): ('b', None, 'seasonal_vic'),
('b', 'yamagata'): ('b', None, 'seasonal_yam'),
('h5n1',''): ('a', 'h5n1', None),
('h7n6',''): ('a', 'h7n6', None),
('h7n9',''): ('a', 'h7n9', None),
('h9n2',''): ('a', 'h9n2', None)}
self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']}
Expand Down Expand Up @@ -215,7 +217,7 @@ def format_viruses(self, documents, data_source, **kwargs):
doc['location'] = self.fix_location[doc['strain']]
self.format_place(doc, determine_location=True)
self.format_region(doc)
self.rethink_io.check_optional_attributes(doc, [])
# self.rethink_io.check_optional_attributes(doc, [])

def format_sequences(self, documents, **kwargs):
'''
Expand All @@ -232,7 +234,7 @@ def format_sequences(self, documents, **kwargs):
self.format_passage(doc, 'passage', 'passage_category')
self.format_passage(doc, 'virus_strain_passage', 'virus_strain_passage_category') #BP
self.format_passage(doc, 'serum_antigen_passage', 'serum_antigen_passage_category') #BP
self.rethink_io.check_optional_attributes(doc, [])
# self.rethink_io.check_optional_attributes(doc, [])
self.fix_casing(doc, args.data_source)
print("Names that need to be fixed")
for name in sorted(self.fix):
Expand Down Expand Up @@ -675,7 +677,7 @@ def determine_group_fields(self, v, patterns, **kwargs):
('gisaid_location', 'Location'), ('originating_lab', 'Originating_Lab'), ('Host_Age', 'Host_Age'),
('Host_Age_Unit', 'Host_Age_Unit'), ('gender', 'Host_Gender'), ('submission_date', 'Submission_Date'),
('submitting_lab', 'Submitting_Lab'), ('authors','Authors'), ('domestic_status','Domestic_Status'),
('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade')]
('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade'), ('pathogenicity', 'Pathogenicity')]
setattr(args, 'xls_fields_wanted', xls_fields_wanted)
elif (args.data_source == 'ird'):
virus_fasta_fields = {0:'strain', 4: 'vtype', 5: 'Subtype', 6:'collection_date', 8:'country', 10: 'host', 11:'h5_clade'}
Expand All @@ -689,4 +691,39 @@ def determine_group_fields(self, v, patterns, **kwargs):
if not os.path.isdir(args.path):
os.makedirs(args.path)
connVDB = flu_upload(**args.__dict__)
connVDB.upload(**args.__dict__)
(viruses, sequences) = connVDB.upload(**args.__dict__)

# Write one FASTA file per segment (locus) under data/, built from the parsed
# documents rather than from the database.
#
# `sequences` is a list of dicts such as
#   {'accession': 'EPI1895707', 'strain': 'A/ruddyturnstone/DelawareBay/281/2020', ...}
# `viruses` is a list of dicts such as
#   {'strain': 'A/quail/Aichi/6/2009', ..., 'sequences': ['EPI266264', 'EPI266265', ...]}
import contextlib
from collections import defaultdict

# Index sequences by accession; if an accession is duplicated the last record wins.
sequences_by_accession = {s['accession']: s for s in sequences}

# Looking at avian-flu / ingest we want the following '|' separated FASTA header fields:
header = ['strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'host', 'domestic_status', 'subtype', 'originating_lab', 'submitting_lab', 'authors', 'PMID', 'gisaid_clade', 'h5_clade', 'pathogenicity']
# Position of the accession column, looked up once instead of once per sequence.
accession_idx = header.index('accession')

# strain -> set of loci already written, so each strain contributes at most
# one sequence per segment.
seen = defaultdict(set)

# ExitStack closes (and therefore flushes) every per-locus file even if an
# error is raised part-way through, and before the copy hint is printed below.
with contextlib.ExitStack() as stack:
    handles = {
        locus: stack.enter_context(open(f"data/{locus}.fasta", 'w'))
        for locus in {s['locus'] for s in sequences}
    }
    print(f"Opened file handles for {', '.join(handles.keys())} loci")

    for virus in viruses:
        strain = virus['strain']
        for accession in virus['sequences']:
            sequence = sequences_by_accession.get(accession)
            if sequence is None:
                print(f"WARNING: missing accession {accession} for virus {virus['strain']}")
                continue

            locus = sequence['locus']
            if locus in seen[strain]:
                print(f"WARNING: skipping _virus_ {strain} _segment_ {locus} as already seen!")
                continue
            seen[strain].add(locus)

            # Missing or None metadata is written as '?'.
            fields = [str(virus[field]) if (field in virus and virus[field] is not None) else '?' for field in header]
            # The virus document carries no per-segment accession, so fill it
            # in from the sequence being written.
            fields[accession_idx] = accession

            # WIP guard: this export is only meant for A/H7N6. Raised explicitly
            # (rather than via `assert`) so it still runs under `python -O`.
            if virus['subtype'] != 'h7n6':
                raise AssertionError(f"unexpected subtype {virus['subtype']!r} for virus {strain}")

            handle = handles[locus]
            handle.write(">" + '|'.join(fields) + '\n')
            handle.write(sequence['sequence'] + "\n")

print("\n\nmkdir -p ../avian-flu/ingest/fauna/data\ncp data/*.fasta ../avian-flu/ingest/fauna/data/")
8 changes: 4 additions & 4 deletions vdb/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ def add_virus_fields(self, v, host, country, **kwargs):
else :
v['country'] = country
v['virus'] = self.virus
v['timestamp'] = self.rethink_io.get_upload_timestamp()
v['virus_inclusion_date'] = self.rethink_io.get_upload_date()
# v['timestamp'] = self.rethink_io.get_upload_timestamp()
# v['virus_inclusion_date'] = self.rethink_io.get_upload_date()
v['sequences'] = []
v['number_sequences'] = 0
return v
Expand Down Expand Up @@ -208,8 +208,8 @@ def add_sequence_fields(self, v, locus, authors, title, source, url, public=True
if 'public' not in v and public is not None:
v['public'] = public
v['virus'] = self.virus
v['timestamp'] = self.rethink_io.get_upload_timestamp()
v['sequence_inclusion_date'] = self.rethink_io.get_upload_date()
# v['timestamp'] = self.rethink_io.get_upload_timestamp()
# v['sequence_inclusion_date'] = self.rethink_io.get_upload_date()
return v

def get_GIs(self, accessions, n_entrez=2500, **kwargs):
Expand Down
4 changes: 3 additions & 1 deletion vdb/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def upload(self, preview=False, **kwargs):
'''
format virus information, then upload to database
'''
self.connect(**kwargs)
# self.connect(**kwargs)
print("Uploading Viruses to VDB")
viruses, sequences = self.parse(**kwargs)
print('Formatting documents for upload')
Expand All @@ -75,6 +75,7 @@ def upload(self, preview=False, **kwargs):
#self.transfer_fields(viruses, sequences, self.virus_to_sequence_transfer_fields)
print("")
print("Upload Step")
preview = True # avoid uploading at all costs!
if not preview:
print("Uploading viruses to " + self.database + "." + self.viruses_table)
self.upload_documents(self.viruses_table, viruses, index='strain', **kwargs)
Expand All @@ -87,6 +88,7 @@ def upload(self, preview=False, **kwargs):
print(json.dumps(sequences[0], indent=1))
print("Remove \"--preview\" to upload documents")
print("Printed preview of viruses to be uploaded to make sure fields make sense")
return (viruses, sequences)

def connect(self, **kwargs):
if self.database not in self.uploadable_databases:
Expand Down

0 comments on commit dc313d6

Please sign in to comment.