From 5d3281dd1a4f434a46bf16e2ea951f510c0cec8e Mon Sep 17 00:00:00 2001
From: j23414
Date: Thu, 17 Nov 2022 11:29:55 -0800
Subject: [PATCH] parameterize genbank-url by NCBI Taxon ID

Parameterized the genbank-url script and subsequent calls by NCBI Taxon
ID. This change generalizes the scripts and rules such that swapping in
a different virus should only require a different Taxon ID.

Co-authored-by: Jover Lee
---
 ingest/bin/fetch-from-genbank                 |   7 +-
 ingest/bin/genbank-url                        | 139 +++++++++++------
 .../snakemake_rules/fetch_sequences.smk       |   3 +-
 3 files changed, 94 insertions(+), 55 deletions(-)

diff --git a/ingest/bin/fetch-from-genbank b/ingest/bin/fetch-from-genbank
index 2a7ee3b0..ac6ed8bc 100755
--- a/ingest/bin/fetch-from-genbank
+++ b/ingest/bin/fetch-from-genbank
@@ -6,15 +6,16 @@
 set -euo pipefail
 
 bin="$(dirname "$0")"
+TAXID="${1:?NCBI taxon id is required.}"
 
 main() {
-    fetch | "$bin"/csv-to-ndjson
+    fetch "$TAXID" | "$bin"/csv-to-ndjson
 }
 
 fetch() {
-    curl "$("$bin"/genbank-url)" \
+    curl "$("$bin"/genbank-url --taxonid "$1")" \
         --fail --silent --show-error --http1.1 \
-        --header 'User-Agent: https://github.com/nextstrain/monkeypox (hello@nextstrain.org)'
+        --header 'User-Agent: https://github.com/nextstrain/dengue (hello@nextstrain.org)'
 }
 
 main "$@"
diff --git a/ingest/bin/genbank-url b/ingest/bin/genbank-url
index fc218dfa..3308acd6 100755
--- a/ingest/bin/genbank-url
+++ b/ingest/bin/genbank-url
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Generate URL to download all Monkeypox sequences and their curated metadata
+Generate URL to download all Pathogen sequences and their curated metadata
 from GenBank via NCBI Virus.
 
 The URL this program builds is based on the URL for SARS-CoV-2 constructed with
@@ -13,54 +13,91 @@ and observing the network activity at
     https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Dengue%20virus,%20taxid:12637
 """
 from urllib.parse import urlencode
+import argparse
 
-endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
-params = {
-    # Search criteria
-    'fq': [
-        '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-        'VirusLineageId_ss:(10244)', # NCBI Taxon id for Monkeypox
-    ],
-
-    # Unclear, but seems necessary.
-    'q': '*:*',
-
-    # Response format
-    'cmd': 'download',
-    'dlfmt': 'csv',
-    'fl': ','.join(
-        ':'.join(names) for names in [
-            # Pairs of (output column name, source data field).
-            ('genbank_accession', 'id'),
-            ('genbank_accession_rev', 'AccVer_s'),
-            ('database', 'SourceDB_s'),
-            ('strain', 'Isolate_s'),
-            ('region', 'Region_s'),
-            ('location', 'CountryFull_s'),
-            ('collected', 'CollectionDate_s'),
-            ('submitted', 'CreateDate_dt'),
-            ('length', 'SLen_i'),
-            ('host', 'Host_s'),
-            ('isolation_source', 'Isolation_csv'),
-            ('bioproject_accession', 'BioProject_s'),
-            ('biosample_accession', 'BioSample_s'),
-            ('sra_accession', 'SRALink_csv'),
-            ('title', 'Definition_s'),
-            ('authors', 'Authors_csv'),
-            ('submitting_organization', 'SubmitterAffilFull_s'),
-            ('publications', 'PubMed_csv'),
-            ('sequence', 'Nucleotide_seq'),
-        ]
-    ),
-
-    # Stable sort with GenBank accessions.
-    # Columns are source data fields, not our output columns.
-    'sort': 'id asc',
-
-    # This isn't Entrez, but include the same email parameter it requires just
-    # to be nice.
-    'email': 'hello@nextstrain.org',
-}
-query = urlencode(params, doseq = True, encoding = "utf-8")
-
-print(f"{endpoint}?{query}")
+
+def parse_args():
+    """
+    Parse command-line arguments; --taxonid (NCBI Taxon ID) is required.
+    """
+    parser = argparse.ArgumentParser(
+        description="Given an NCBI taxon ID, generate URL to download "
+        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
+    )
+    parser.add_argument(
+        "--taxonid",
+        help="NCBI Taxon ID.",
+        required=True
+    )
+    return parser.parse_args()
+
+
+def build_query_url(ncbi_id: str):
+    """
+    Print the NCBI Virus URL that downloads, as CSV, every nucleotide
+    sequence (with curated metadata) under the NCBI taxon *ncbi_id*.
+    """
+    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
+    params = {
+        # Search criteria
+        'fq': [
+            # Nucleotide sequences (as opposed to protein)
+            '{!tag=SeqType_s}SeqType_s:("Nucleotide")',
+            # NCBI Taxon id for virus (must be an f-string so the id is interpolated)
+            f'VirusLineageId_ss:({ncbi_id})',
+        ],
+
+        # Unclear, but seems necessary.
+        'q': '*:*',
+
+        # Response format
+        'cmd': 'download',
+        'dlfmt': 'csv',
+        'fl': ','.join(
+            ':'.join(names) for names in [
+                # Pairs of (output column name, source data field).
+                ('genbank_accession', 'id'),
+                ('genbank_accession_rev', 'AccVer_s'),
+                ('database', 'SourceDB_s'),
+                ('strain', 'Isolate_s'),
+                ('region', 'Region_s'),
+                ('location', 'CountryFull_s'),
+                ('collected', 'CollectionDate_s'),
+                ('submitted', 'CreateDate_dt'),
+                ('length', 'SLen_i'),
+                ('host', 'Host_s'),
+                ('isolation_source', 'Isolation_csv'),
+                ('bioproject_accession', 'BioProject_s'),
+                ('biosample_accession', 'BioSample_s'),
+                ('sra_accession', 'SRALink_csv'),
+                ('title', 'Definition_s'),
+                ('authors', 'Authors_csv'),
+                ('submitting_organization', 'SubmitterAffilFull_s'),
+                ('publications', 'PubMed_csv'),
+                ('sequence', 'Nucleotide_seq'),
+            ]
+        ),
+
+        # Stable sort with GenBank accessions.
+        # Columns are source data fields, not our output columns.
+        'sort': 'id asc',
+
+        # This isn't Entrez, but include the same email parameter it requires just
+        # to be nice.
+        'email': 'hello@nextstrain.org',
+    }
+    query = urlencode(params, doseq=True, encoding="utf-8")
+
+    print(f"{endpoint}?{query}")
+
+
+def main():
+    """
+    Print the download URL for the taxon ID given on the command line.
+    """
+    args = parse_args()
+    build_query_url(args.taxonid)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 7efb7c3f..d1c2173f 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -27,6 +27,7 @@ rule fetch_from_genbank:
     output:
         genbank_ndjson="data/genbank_{serotype}.ndjson",
     params:
+        serotype_tax_id=download_serotype,
         csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
     shell:
         """
@@ -39,7 +40,7 @@ rule fetch_from_genbank:
             chmod 755 *
             cd ..
         fi
-        ./bin/fetch-from-genbank > {output.genbank_ndjson}
+        ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
         """
 
 