WIP A/H7N6

A very ad-hoc, temporary approach I used to generate FASTA files that I could then use as inputs for avian-flu builds. It avoids any use of RethinkDB. See <nextstrain/avian-flu#108> for the corresponding avian-flu work.
nextstrain · Dec 2, 2024 · dc313d6 · dc313d6
1 parent d632af0
commit dc313d6
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 9 deletions.
diff --git a/vdb/avian_flu_upload.py b/vdb/avian_flu_upload.py
@@ -41,6 +41,7 @@ def __init__(self, **kwargs):
                     ('a / h7n1', ''): ('a', 'h7n1', None),
                     ('a / h7n2', ''): ('a', 'h7n2', None),
                     ('a / h7n3', ''): ('a', 'h7n3', None),
+                    ('a / h7n6', ''): ('a', 'h7n6', None),
                     ('a / h7n7', ''): ('a', 'h7n7', None),
                     ('a / h7n9', ''): ('a', 'h7n9', None),
                     ('a / h9n2', ''): ('a', 'h9n2', None),
@@ -52,6 +53,7 @@ def __init__(self, **kwargs):
                     ('b', 'victoria'): ('b', None, 'seasonal_vic'),
                     ('b', 'yamagata'): ('b', None, 'seasonal_yam'),
                     ('h5n1',''): ('a', 'h5n1', None),
+                    ('h7n6',''): ('a', 'h7n6', None),
                     ('h7n9',''): ('a', 'h7n9', None),
                     ('h9n2',''): ('a', 'h9n2', None)}
         self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']}
@@ -215,7 +217,7 @@ def format_viruses(self, documents, data_source, **kwargs):
                     doc['location'] = self.fix_location[doc['strain']]
             self.format_place(doc, determine_location=True)
             self.format_region(doc)
-            self.rethink_io.check_optional_attributes(doc, [])
+            # self.rethink_io.check_optional_attributes(doc, [])
 
     def format_sequences(self, documents, **kwargs):
         '''
@@ -232,7 +234,7 @@ def format_sequences(self, documents, **kwargs):
             self.format_passage(doc, 'passage', 'passage_category')
             self.format_passage(doc, 'virus_strain_passage', 'virus_strain_passage_category') #BP
             self.format_passage(doc, 'serum_antigen_passage', 'serum_antigen_passage_category') #BP
-            self.rethink_io.check_optional_attributes(doc, [])
+            # self.rethink_io.check_optional_attributes(doc, [])
             self.fix_casing(doc, args.data_source)
         print("Names that need to be fixed")
         for name in sorted(self.fix):
@@ -675,7 +677,7 @@ def determine_group_fields(self, v, patterns, **kwargs):
                                  ('gisaid_location', 'Location'), ('originating_lab', 'Originating_Lab'), ('Host_Age', 'Host_Age'),
                                  ('Host_Age_Unit', 'Host_Age_Unit'), ('gender', 'Host_Gender'), ('submission_date', 'Submission_Date'),
                                  ('submitting_lab', 'Submitting_Lab'), ('authors','Authors'), ('domestic_status','Domestic_Status'),
-                                 ('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade')]
+                                 ('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade'), ('pathogenicity', 'Pathogenicity')]
         setattr(args, 'xls_fields_wanted', xls_fields_wanted)
     elif (args.data_source == 'ird'):
         virus_fasta_fields = {0:'strain', 4: 'vtype', 5: 'Subtype', 6:'collection_date', 8:'country', 10: 'host', 11:'h5_clade'}
@@ -689,4 +691,39 @@ def determine_group_fields(self, v, patterns, **kwargs):
     if not os.path.isdir(args.path):
         os.makedirs(args.path)
     connVDB = flu_upload(**args.__dict__)
-    connVDB.upload(**args.__dict__)
+    (viruses, sequences) = connVDB.upload(**args.__dict__)
+
+    # sequences is a list of dicts, e.g. {'accession': 'EPI1895707', 'strain': 'A/ruddyturnstone/DelawareBay/281/2020', 'isolate_id': ..., 'locus': ..., 'sequence': ..., ...}
+    # viruses is a list of dicts, e.g. {'strain': 'A/quail/Aichi/6/2009', ..., 'sequences': ['EPI266264', 'EPI266265', 'EPI266266', 'EPI266267', 'EPI266268', 'EPI266269', 'EPI266270', 'EPI266271'], ...}
+    sequences_by_accession = {s['accession']: s for s in sequences} # may override if dups
+
+    handles = {locus: open(f"data/{locus}.fasta", 'w') for locus in {s['locus'] for s in sequences}}
+    print(f"Opened file handles for {', '.join(handles.keys())} loci")
+
+    # Looking at avian-flu / ingest we want the following '|' separated FASTA header fields:
+    header = ['strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'host', 'domestic_status', 'subtype', 'originating_lab', 'submitting_lab', 'authors', 'PMID', 'gisaid_clade', 'h5_clade', 'pathogenicity']
+
+    from collections import defaultdict
+    seen = defaultdict(set)
+
+    for virus in viruses:
+        strain = virus['strain']
+        for accession in virus['sequences']:
+            sequence = sequences_by_accession.get(accession, None)
+            if sequence is None:
+                print(f"WARNING: missing accession {accession} for virus {virus['strain']}")
+                continue
+            locus = sequence['locus']
+            if locus in seen[strain]:
+                print(f"WARNING: skipping _virus_ {strain} _segment_ {locus} as already seen!")
+                continue
+            else:
+                seen[strain].add(locus)
+            fields = [str(virus[field]) if (field in virus and virus[field] is not None) else '?' for field in header]
+            fields[header.index('accession')] = accession
+            assert virus['subtype']=='h7n6'
+            handle = handles[sequence['locus']]
+            handle.write(">"+'|'.join(fields)+'\n')
+            handle.write(sequence['sequence'] + "\n")
+
+    print("\n\nmkdir -p ../avian-flu/ingest/fauna/data\ncp data/*.fasta ../avian-flu/ingest/fauna/data/")
diff --git a/vdb/parse.py b/vdb/parse.py
@@ -170,8 +170,8 @@ def add_virus_fields(self, v, host, country, **kwargs):
             else :
                 v['country'] = country
         v['virus'] = self.virus
-        v['timestamp'] = self.rethink_io.get_upload_timestamp()
-        v['virus_inclusion_date'] = self.rethink_io.get_upload_date()
+        # v['timestamp'] = self.rethink_io.get_upload_timestamp()
+        # v['virus_inclusion_date'] = self.rethink_io.get_upload_date()
         v['sequences'] = []
         v['number_sequences'] = 0
         return v
@@ -208,8 +208,8 @@ def add_sequence_fields(self, v, locus, authors, title, source, url, public=True
         if 'public' not in v and public is not None:
             v['public'] = public
         v['virus'] = self.virus
-        v['timestamp'] = self.rethink_io.get_upload_timestamp()
-        v['sequence_inclusion_date'] = self.rethink_io.get_upload_date()
+        # v['timestamp'] = self.rethink_io.get_upload_timestamp()
+        # v['sequence_inclusion_date'] = self.rethink_io.get_upload_date()
         return v
 
     def get_GIs(self, accessions, n_entrez=2500, **kwargs):

diff --git a/vdb/upload.py b/vdb/upload.py
@@ -55,7 +55,7 @@ def upload(self, preview=False, **kwargs):
         '''
         format virus information, then upload to database
         '''
-        self.connect(**kwargs)
+        # self.connect(**kwargs)
         print("Uploading Viruses to VDB")
         viruses, sequences = self.parse(**kwargs)
         print('Formatting documents for upload')
@@ -75,6 +75,7 @@ def upload(self, preview=False, **kwargs):
         #self.transfer_fields(viruses, sequences, self.virus_to_sequence_transfer_fields)
         print("")
         print("Upload Step")
+        preview = True # avoid uploading at all costs!
         if not preview:
             print("Uploading viruses to " + self.database + "." + self.viruses_table)
             self.upload_documents(self.viruses_table, viruses, index='strain', **kwargs)
@@ -87,6 +88,7 @@ def upload(self, preview=False, **kwargs):
             print(json.dumps(sequences[0], indent=1))
             print("Remove \"--preview\" to upload documents")
             print("Printed preview of viruses to be uploaded to make sure fields make sense")
+        return (viruses, sequences)
 
     def connect(self, **kwargs):
         if self.database not in self.uploadable_databases: