parameterize genbank-url by NCBI Taxon ID

Parameterized the genbank-url script and subsequent calls by NCBI Taxon ID. This change generalizes the scripts and rules such that swapping in a different virus should only require a different Taxon ID. Co-authored-by: Jover Lee <joverlee521@gmail.com>
nextstrain · Nov 17, 2022 · 5d3281d · 5d3281d
1 parent f777113
commit 5d3281d
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 55 deletions.
diff --git a/ingest/bin/fetch-from-genbank b/ingest/bin/fetch-from-genbank
@@ -6,15 +6,16 @@
 set -euo pipefail
 
 bin="$(dirname "$0")"
+TAXID="${1:?NCBI taxon id is required.}"
 
 main() {
-    fetch | "$bin"/csv-to-ndjson
+    fetch "$TAXID" | "$bin"/csv-to-ndjson
 }
 
 fetch() {
-    curl "$("$bin"/genbank-url)" \
+    curl "$("$bin"/genbank-url --taxonid "$1")" \
         --fail --silent --show-error --http1.1 \
-        --header 'User-Agent: https://github.com/nextstrain/monkeypox (hello@nextstrain.org)'
+        --header 'User-Agent: https://github.com/nextstrain/dengue (hello@nextstrain.org)'
 }
 
 main "$@"
diff --git a/ingest/bin/genbank-url b/ingest/bin/genbank-url
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Generate URL to download all Monkeypox sequences and their curated metadata
+Generate URL to download all pathogen sequences and their curated metadata
 from GenBank via NCBI Virus.
 
 The URL this program builds is based on the URL for SARS-CoV-2 constructed with
@@ -13,54 +13,91 @@ and observing the network activity at
     https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Dengue%20virus,%20taxid:12637
 """
 from urllib.parse import urlencode
+import argparse
 
-endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
-params = {
-    # Search criteria
-    'fq': [
-        '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-        'VirusLineageId_ss:(10244)',                # NCBI Taxon id for Monkeypox
-    ],
-
-    # Unclear, but seems necessary.
-    'q': '*:*',
-
-    # Response format
-    'cmd': 'download',
-    'dlfmt': 'csv',
-    'fl': ','.join(
-        ':'.join(names) for names in [
-            # Pairs of (output column name, source data field).
-            ('genbank_accession',       'id'),
-            ('genbank_accession_rev',   'AccVer_s'),
-            ('database',                'SourceDB_s'),
-            ('strain',                  'Isolate_s'),
-            ('region',                  'Region_s'),
-            ('location',                'CountryFull_s'),
-            ('collected',               'CollectionDate_s'),
-            ('submitted',               'CreateDate_dt'),
-            ('length',                  'SLen_i'),
-            ('host',                    'Host_s'),
-            ('isolation_source',        'Isolation_csv'),
-            ('bioproject_accession',    'BioProject_s'),
-            ('biosample_accession',     'BioSample_s'),
-            ('sra_accession',           'SRALink_csv'),
-            ('title',                   'Definition_s'),
-            ('authors',                 'Authors_csv'),
-            ('submitting_organization', 'SubmitterAffilFull_s'),
-            ('publications',            'PubMed_csv'),
-            ('sequence',                'Nucleotide_seq'),
-        ]
-    ),
-
-    # Stable sort with GenBank accessions.
-    # Columns are source data fields, not our output columns.
-    'sort': 'id asc',
-
-    # This isn't Entrez, but include the same email parameter it requires just
-    # to be nice.
-    'email': 'hello@nextstrain.org',
-}
-query = urlencode(params, doseq = True, encoding = "utf-8")
-
-print(f"{endpoint}?{query}")
+
+def parse_args():
+    """
+    Read this script's command-line options.
+
+    Returns the argparse namespace; its `taxonid` attribute holds the value
+    given to the mandatory `--taxonid` option.
+    """
+    description = (
+        "Given an NCBI taxon ID, generate URL to download "
+        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
+    )
+    cli = argparse.ArgumentParser(description=description)
+    cli.add_argument("--taxonid", required=True, help="NCBI Taxon ID.")
+    return cli.parse_args()
+
+
+def build_query_url(ncbi_id: str) -> None:
+    """
+    Print the URL to download all viral sequences and their curated metadata
+    from GenBank via NCBI Virus for a single NCBI taxon.
+
+    ncbi_id is the NCBI Taxon ID placed in the VirusLineageId_ss search
+    filter. The finished URL is written to stdout; nothing is returned.
+    """
+    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
+    params = {
+        # Search criteria
+        'fq': [
+            # Nucleotide sequences (as opposed to protein).
+            # Deliberately NOT an f-string: the braces here are literal.
+            '{!tag=SeqType_s}SeqType_s:("Nucleotide")',
+            # NCBI Taxon id for virus.
+            # Must be an f-string so the requested taxon ID is interpolated
+            # rather than sending the literal text "{ncbi_id}".
+            f'VirusLineageId_ss:({ncbi_id})',
+        ],
+
+        # Unclear, but seems necessary.
+        'q': '*:*',
+
+        # Response format
+        'cmd': 'download',
+        'dlfmt': 'csv',
+        'fl': ','.join(
+            ':'.join(names) for names in [
+                # Pairs of (output column name, source data field).
+                ('genbank_accession',       'id'),
+                ('genbank_accession_rev',   'AccVer_s'),
+                ('database',                'SourceDB_s'),
+                ('strain',                  'Isolate_s'),
+                ('region',                  'Region_s'),
+                ('location',                'CountryFull_s'),
+                ('collected',               'CollectionDate_s'),
+                ('submitted',               'CreateDate_dt'),
+                ('length',                  'SLen_i'),
+                ('host',                    'Host_s'),
+                ('isolation_source',        'Isolation_csv'),
+                ('bioproject_accession',    'BioProject_s'),
+                ('biosample_accession',     'BioSample_s'),
+                ('sra_accession',           'SRALink_csv'),
+                ('title',                   'Definition_s'),
+                ('authors',                 'Authors_csv'),
+                ('submitting_organization', 'SubmitterAffilFull_s'),
+                ('publications',            'PubMed_csv'),
+                ('sequence',                'Nucleotide_seq'),
+            ]
+        ),
+
+        # Stable sort with GenBank accessions.
+        # Columns are source data fields, not our output columns.
+        'sort': 'id asc',
+
+        # This isn't Entrez, but include the same email parameter it requires just
+        # to be nice.
+        'email': 'hello@nextstrain.org',
+    }
+    # doseq=True expands the 'fq' list into one repeated fq=... parameter per item.
+    query = urlencode(params, doseq=True, encoding="utf-8")
+
+    print(f"{endpoint}?{query}")
+
+
+def main():
+    """
+    Script entry point: read the --taxonid option and print the download URL.
+    """
+    taxon_id = parse_args().taxonid
+    build_query_url(taxon_id)
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -27,6 +27,7 @@ rule fetch_from_genbank:
     output:
         genbank_ndjson="data/genbank_{serotype}.ndjson",
     params:
+        serotype_tax_id=download_serotype,
         csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
     shell:
         """
@@ -39,7 +40,7 @@ rule fetch_from_genbank:
           chmod 755 *
           cd ..
         fi
-        ./bin/fetch-from-genbank > {output.genbank_ndjson}
+        ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
         """