ingest: Switch to NCBI Datasets CLI to fetch data

Replace our custom fetch scripts that uses the NCBI Virus API with the NCBI Datasets CLI commands. NCBI datasets downloads a virus dataset ZIP file that includes a metadata JSON Lines file and a sequences FASTA file. To maintain a record of the single NDJSON file on S3, extract the sequences FASTA file and format the metadata into a TSV file that are parsed into a single NDJSON file using `augur curate passthru`. The metadata TSV is created using the NCBI `dataformat` command so that we do not have to parse the nested JSON lines files ourselves and header fields are renamed to match the previous fields we used for NCBI Virus. The NDJSON file created here no longer includes equivalent fields for "title" or "publication".
nextstrain · Sep 11, 2023 · 82ace30 · 82ace30
1 parent b766498
commit 82ace30
Show file tree

Hide file tree

Showing 2 changed files with 117 additions and 4 deletions.
diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -0,0 +1,17 @@
+key	value
+Accession	genbank_accession_rev
+Source database	database
+Isolate Lineage	strain
+Geographic Region	region
+Geographic Location	location
+Isolate Collection date	collected
+Release date	submitted
+Update date	updated
+Length	length
+Host Name	host
+Isolate Lineage source	isolation_source
+BioProjects	bioproject_accession
+BioSample accession	biosample_accession
+SRA Accessions	sra_accession
+Submitter Names	authors
+Submitter Affiliation	submitting_organization
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -14,13 +14,109 @@ Produces final output as
 """
 
 
-rule fetch_from_genbank:
+rule fetch_ncbi_dataset_package:
     output:
-        genbank_ndjson="data/genbank.ndjson",
-    retries: 5  # Requires snakemake 7.7.0 or later
+        dataset_package = temp("data/ncbi_dataset.zip")
+    retries: 5 # Requires snakemake 7.7.0 or later
+    benchmark:
+        "benchmarks/fetch_ncbi_dataset_package.txt"
     shell:
         """
-        ./vendored/fetch-from-ncbi-virus 10244 nextstrain/monkeypox > {output.genbank_ndjson}
+        datasets download virus genome taxon 10244 \
+            --no-progressbar \
+            --filename {output.dataset_package}
+        """
+
+
+rule extract_ncbi_dataset_sequences:
+    input:
+        dataset_package = "data/ncbi_dataset.zip"
+    output:
+        ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta")
+    benchmark:
+        "benchmarks/extract_ncbi_dataset_sequences.txt"
+    shell:
+        """
+        unzip -jp {input.dataset_package} \
+            ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
+        """
+
+
+def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
+    """
+    Return list of NCBI Dataset report field mnemonics for fields that we want
+    to parse out of the dataset report. The column names in the output TSV
+    are different from the mnemonics.
+
+    See NCBI Dataset docs for full list of available fields and their column
+    names in the output:
+    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+    """
+    fields = [
+        "accession",
+        "sourcedb",
+        "isolate-lineage",
+        "geo-region",
+        "geo-location",
+        "isolate-collection-date",
+        "release-date",
+        "update-date",
+        "length",
+        "host-name",
+        "isolate-lineage-source",
+        "bioprojects",
+        "biosample-acc",
+        "sra-accs",
+        "submitter-names",
+        "submitter-affiliation",
+    ]
+    return ",".join(fields)
+
+
+rule format_ncbi_dataset_report:
+    # Formats the headers to be the same as before we used NCBI Datasets
+    # The only fields we do not have equivalents for are "title" and "publications"
+    input:
+        dataset_package = "data/ncbi_dataset.zip",
+        ncbi_field_map = "source-data/ncbi-dataset-field-map.tsv"
+    output:
+        ncbi_dataset_tsv = temp("data/ncbi_dataset_report.tsv")
+    params:
+        fields_to_include = _get_ncbi_dataset_field_mnemonics
+    benchmark:
+        "benchmarks/format_ncbi_dataset_report.txt"
+    shell:
+        """
+        dataformat tsv virus-genome \
+            --package {input.dataset_package} \
+            --fields {params.fields_to_include:q} \
+            | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
+            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
+            | tsv-select -H -f genbank_accession --rest last \
+            > {output.ncbi_dataset_tsv}
+        """
+
+
+rule format_ncbi_datasets_ndjson:
+    input:
+        ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta",
+        ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv",
+    output:
+        ndjson = "data/genbank.ndjson",
+    log:
+        "logs/format_ncbi_datasets_ndjson.txt"
+    benchmark:
+        "benchmarks/format_ncbi_datasets_ndjson.txt"
+    shell:
+        """
+        augur curate passthru \
+            --metadata {input.ncbi_dataset_tsv} \
+            --fasta {input.ncbi_dataset_sequences} \
+            --seq-id-column genbank_accession_rev \
+            --seq-field sequence \
+            --unmatched-reporting warn \
+            --duplicate-reporting warn \
+            2> {log} > {output.ndjson}
         """