From 5cd9af6def78ccd8469f48502fb84e10452f1470 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 7 Feb 2023 15:38:52 -0800
Subject: [PATCH] Parameterize ncbi_taxon_id in fetch_sequences

Historically the NCBI Taxon ID has been hardcoded in ingest/bin/genbank-url
(e.g. 'VirusLineageId_ss:(10244)') and this commit generalizes the script to
take an NCBI Taxon ID which could be defined at the Snakemake rule level.

Even if we move to use NCBI datasets, parameterizing the NCBI Taxon ID would
feed nicely into the datasets command:

f"datasets download virus genome taxon {params.ncbi_id} --no-progressbar --filename {output.dataset_package}"
---
 ingest/bin/fetch-from-genbank                 |   6 +-
 ingest/bin/genbank-url                        | 116 +++++++++++-------
 .../snakemake_rules/fetch_sequences.smk       |   2 +-
 3 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/ingest/bin/fetch-from-genbank b/ingest/bin/fetch-from-genbank
index 2a7ee3b0..2ecb6768 100755
--- a/ingest/bin/fetch-from-genbank
+++ b/ingest/bin/fetch-from-genbank
@@ -7,12 +7,14 @@ set -euo pipefail
 
 bin="$(dirname "$0")"
 
+
 main() {
-    fetch | "$bin"/csv-to-ndjson
+    local ncbi_taxon_id="${1:?NCBI taxon id is required.}"
+    fetch "$ncbi_taxon_id" | "$bin"/csv-to-ndjson
 }
 
 fetch() {
-    curl "$("$bin"/genbank-url)" \
+    curl "$("$bin"/genbank-url --ncbi-taxon-id "$1")" \
         --fail --silent --show-error --http1.1 \
         --header 'User-Agent: https://github.com/nextstrain/monkeypox (hello@nextstrain.org)'
 }
diff --git a/ingest/bin/genbank-url b/ingest/bin/genbank-url
index 29108419..8c61616b 100755
--- a/ingest/bin/genbank-url
+++ b/ingest/bin/genbank-url
@@ -12,54 +12,80 @@ and observing the network activity at
     https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Monkeypox%20virus,%20taxid:10244
 """
 from urllib.parse import urlencode
+import argparse
 
-endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
-params = {
-    # Search criteria
-    'fq': [
-        '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-        'VirusLineageId_ss:(10244)',                # NCBI Taxon id for Monkeypox
-    ],
+def parse_args():
+    """Parse command-line arguments; --ncbi-taxon-id is required and has no default."""
+    parser = argparse.ArgumentParser(
+        description="Given an NCBI taxon ID, generate URL to download "
+        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
+    )
+    parser.add_argument(
+        "--ncbi-taxon-id",
+        help="NCBI Taxon ID, e.g. 10244 for Monkeypox virus.",
+        required=True,
+    )
+    return parser.parse_args()
 
-    # Unclear, but seems necessary.
-    'q': '*:*',
+def build_query_url(ncbi_taxon_id: str):
+    """
+    Print the URL to download all viral sequences and their curated metadata
+    for the given NCBI Taxon ID from GenBank via NCBI Virus.
+    """
+    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
+    params = {
+        # Search criteria
+        'fq': [
+            '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
+            f'VirusLineageId_ss:({ncbi_taxon_id})',                # NCBI Taxon id supplied by the caller
+        ],
 
-    # Response format
-    'cmd': 'download',
-    'dlfmt': 'csv',
-    'fl': ','.join(
-        ':'.join(names) for names in [
-            # Pairs of (output column name, source data field).
-            ('genbank_accession',       'id'),
-            ('genbank_accession_rev',   'AccVer_s'),
-            ('database',                'SourceDB_s'),
-            ('strain',                  'Isolate_s'),
-            ('region',                  'Region_s'),
-            ('location',                'CountryFull_s'),
-            ('collected',               'CollectionDate_s'),
-            ('submitted',               'CreateDate_dt'),
-            ('length',                  'SLen_i'),
-            ('host',                    'Host_s'),
-            ('isolation_source',        'Isolation_csv'),
-            ('bioproject_accession',    'BioProject_s'),
-            ('biosample_accession',     'BioSample_s'),
-            ('sra_accession',           'SRALink_csv'),
-            ('title',                   'Definition_s'),
-            ('authors',                 'Authors_csv'),
-            ('submitting_organization', 'SubmitterAffilFull_s'),
-            ('publications',            'PubMed_csv'),
-            ('sequence',                'Nucleotide_seq'),
-        ]
-    ),
+        # Unclear, but seems necessary.
+        'q': '*:*',
 
-    # Stable sort with GenBank accessions.
-    # Columns are source data fields, not our output columns.
-    'sort': 'id asc',
+        # Response format
+        'cmd': 'download',
+        'dlfmt': 'csv',
+        'fl': ','.join(
+            ':'.join(names) for names in [
+                # Pairs of (output column name, source data field).
+                ('genbank_accession',       'id'),
+                ('genbank_accession_rev',   'AccVer_s'),
+                ('database',                'SourceDB_s'),
+                ('strain',                  'Isolate_s'),
+                ('region',                  'Region_s'),
+                ('location',                'CountryFull_s'),
+                ('collected',               'CollectionDate_s'),
+                ('submitted',               'CreateDate_dt'),
+                ('length',                  'SLen_i'),
+                ('host',                    'Host_s'),
+                ('isolation_source',        'Isolation_csv'),
+                ('bioproject_accession',    'BioProject_s'),
+                ('biosample_accession',     'BioSample_s'),
+                ('sra_accession',           'SRALink_csv'),
+                ('title',                   'Definition_s'),
+                ('authors',                 'Authors_csv'),
+                ('submitting_organization', 'SubmitterAffilFull_s'),
+                ('publications',            'PubMed_csv'),
+                ('sequence',                'Nucleotide_seq'),
+            ]
+        ),
 
-    # This isn't Entrez, but include the same email parameter it requires just
-    # to be nice.
-    'email': 'hello@nextstrain.org',
-}
-query = urlencode(params, doseq = True, encoding = "utf-8")
+        # Stable sort with GenBank accessions.
+        # Columns are source data fields, not our output columns.
+        'sort': 'id asc',
 
-print(f"{endpoint}?{query}")
+        # This isn't Entrez, but include the same email parameter it requires just
+        # to be nice.
+        'email': 'hello@nextstrain.org',
+    }
+    query = urlencode(params, doseq = True, encoding = "utf-8")
+
+    print(f"{endpoint}?{query}")
+
+def main():
+    args = parse_args()
+    build_query_url(args.ncbi_taxon_id)
+
+if __name__ == '__main__':
+    main()
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index ebad1c26..9f330062 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -19,7 +19,7 @@ rule fetch_from_genbank:
         genbank_ndjson="data/genbank.ndjson",
     shell:
         """
-        ./bin/fetch-from-genbank > {output.genbank_ndjson}
+        ./bin/fetch-from-genbank 10244 > {output.genbank_ndjson}
         """