From 4966936a75726f5efc5ee06012615406b28f88d9 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 29 Nov 2023 16:27:42 -0800
Subject: [PATCH] fixup: NCBI Dataset field name transformations

---
 ingest/config/config.yaml                     | 23 ++++++++++-
 ingest/source-data/ncbi-dataset-field-map.tsv | 17 --------
 .../snakemake_rules/fetch_sequences.smk       | 40 +++----------------
 3 files changed, 26 insertions(+), 54 deletions(-)
 delete mode 100644 ingest/source-data/ncbi-dataset-field-map.tsv

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index fdf5cb4..927bd75 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -2,8 +2,27 @@
 sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '64320'
-# Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields; see docs for the full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - sra-accs
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - biosample-acc
+  - submitter-names
+  - submitter-affiliation
+  - submitter-country
 
 # Params for the transform rule
 transform:
diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
deleted file mode 100644
index 57b4f8c..0000000
--- a/ingest/source-data/ncbi-dataset-field-map.tsv
+++ /dev/null
@@ -1,17 +0,0 @@
-# Maps the NCBI output TSV column names back to the NCBI mnemonics.
-# This list should match the list in
-# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics
-key	value
-Accession	accession-rev
-Source database	sourcedb
-Isolate Lineage	isolate-lineage
-Geographic Region	geo-region
-Geographic Location	geo-location
-Isolate Collection date	isolate-collection-date
-Release date	release-date
-Update date	update-date
-Length	length
-Host Name	host-name
-SRA Accessions	sra-accs
-Submitter Names	submitter-names
-Submitter Affiliation	submitter-affiliation
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 8d27193..2fef4b1 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -44,54 +44,24 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "bioprojects",
-        "biosample-acc",
-        "sra-accs",
-        "submitter-names",
-        "submitter-affiliation",
-    ]
-    return ",".join(fields)
-
-
 rule format_ncbi_dataset_report:
     # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
-        ncbi_field_map=config["ncbi_field_map"],
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics,
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
-            | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
+            | csvtk rename -t -f accession -n accession-rev \
             | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
             | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}