From 4966936a75726f5efc5ee06012615406b28f88d9 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Nov 2023 16:27:42 -0800 Subject: [PATCH] fixup: NCBI Dataset field name transformations --- ingest/config/config.yaml | 23 ++++++++++- ingest/source-data/ncbi-dataset-field-map.tsv | 17 -------- .../snakemake_rules/fetch_sequences.smk | 40 +++---------------- 3 files changed, 26 insertions(+), 54 deletions(-) delete mode 100644 ingest/source-data/ncbi-dataset-field-map.tsv diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index fdf5cb4..927bd75 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -2,8 +2,27 @@ sources: ['genbank'] # Pathogen NCBI Taxonomy ID ncbi_taxon_id: '64320' -# Renames the NCBI dataset headers -ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv' +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country # Params for the transform rule transform: diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv deleted file mode 100644 index 57b4f8c..0000000 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ /dev/null @@ -1,17 +0,0 @@ -# Maps the NCBI output TSV column names back to the NCBI mnemonics. 
-# This list should match the list in -# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics -key value -Accession accession-rev -Source database sourcedb -Isolate Lineage isolate-lineage -Geographic Region geo-region -Geographic Location geo-location -Isolate Collection date isolate-collection-date -Release date release-date -Update date update-date -Length length -Host Name host-name -SRA Accessions sra-accs -Submitter Names submitter-names -Submitter Affiliation submitter-affiliation diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 8d27193..2fef4b1 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -44,54 +44,24 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. The column names in the output TSV - are different from the mnemonics. 
- - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "bioprojects", - "biosample-acc", - "sra-accs", - "submitter-names", - "submitter-affiliation", - ] - return ",".join(fields) - - rule format_ncbi_dataset_report: # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", - ncbi_field_map=config["ncbi_field_map"], output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics, + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ - | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ + | csvtk rename -t -f accession -n accession-rev \ | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \ | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv}