Skip to content

Commit

Permalink
fixup: NCBI Dataset field name transformations
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Dec 5, 2023
1 parent 870c938 commit 9605bf5
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 137 deletions.
23 changes: 21 additions & 2 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,27 @@
sources: ['genbank']
# Pathogen NCBI Taxonomy ID
ncbi_taxon_id: '64320'
# Renames the NCBI dataset headers
ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the mnemonics of the NCBI Datasets fields; see the docs for the full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- sra-accs
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- biosample-acc
- submitter-names
- submitter-affiliation
- submitter-country

# Params for the transform rule
transform:
Expand Down
17 changes: 0 additions & 17 deletions ingest/source-data/ncbi-dataset-field-map.tsv

This file was deleted.

43 changes: 5 additions & 38 deletions ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -44,56 +44,23 @@ rule extract_ncbi_dataset_sequences:
"""


def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
"""
Return list of NCBI Dataset report field mnemonics for fields that we want
to parse out of the dataset report. The column names in the output TSV
are different from the mnemonics.
See NCBI Dataset docs for full list of available fields and their column
names in the output:
https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
"""
fields = [
"accession",
"sourcedb",
"isolate-lineage",
"geo-region",
"geo-location",
"isolate-collection-date",
"release-date",
"update-date",
"length",
"host-name",
"isolate-lineage-source",
"bioprojects",
"biosample-acc",
"sra-accs",
"submitter-names",
"submitter-affiliation",
]
return ",".join(fields)


rule format_ncbi_dataset_report:
# Formats the headers to match the NCBI mnemonic names
# Runs `dataformat tsv virus-genome` on the dataset package and redirects the
# piped result to the temp() output `ncbi_dataset_tsv`.
# NOTE(review): this listing appears to interleave removed and added diff lines
# without +/- markers (two `params` entries, two `--fields` lines, and both a
# rename2/mutate/tsv-select tail and an --elide-header/add-header tail) --
# confirm against the repository before treating it as a runnable rule.
input:
dataset_package="data/ncbi_dataset.zip",
ncbi_field_map=config["ncbi_field_map"],
output:
ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
params:
fields_to_include=_get_ncbi_dataset_field_mnemonics,
ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
benchmark:
"benchmarks/format_ncbi_dataset_report.txt"
shell:
"""
dataformat tsv virus-genome \
--package {input.dataset_package} \
--fields {params.fields_to_include:q} \
| csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
| csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
| tsv-select -H -f accession --rest last \
--fields {params.ncbi_datasets_fields:q} \
--elide-header \
| csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
> {output.ncbi_dataset_tsv}
"""

Expand All @@ -113,7 +80,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession-rev \
--seq-id-column accession \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
163 changes: 83 additions & 80 deletions phylogenetic/config/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -1,87 +1,90 @@
MG827392
KX369547 # PF13/251013_18 # reference included in config/zika_reference.gb
KY553111 # AFMC_U # too basal
KY962729 # AFMC_S # too basal
KY120353 # Boracay/16423 # too basal
KU179098 # JMB_185 # too basal
KU681082 # PHL/2012/CPC_0740 # too basal
MG827392.1
KX369547.1 # PF13/251013_18 # reference included in config/zika_reference.gb
KY553111.1 # AFMC_U # too basal
KY962729.1 # AFMC_S # too basal
KY120353.1 # Boracay/16423 # too basal
KU179098.1 # JMB_185 # too basal
KU681082.1 # PHL/2012/CPC_0740 # too basal
VIE/Bra/2016 # too basal
KU853013 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
KU740184 # GD01 # duplicate of other strain in dataset
KU761564 # GDZ16001 # duplicate of other strain in dataset
KX893855 # VEN/UF_2/2016 # duplicate of other strain in dataset
KY927808 # ZZ_1 # duplicate of other strain in dataset
KY003154 # VR10599/Pavia/2016 # export with unknown origin
KY003153 # 34997/Pavia/2016 # export with unknown origin
MF574552 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015
MF574559 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015
MF574560 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015
MF574561 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015
MF574571 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015
MF574555 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015
MF574557 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015
MF574562 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015
MF574572 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015
MF574570 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015
MF574565 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015
MF574568 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015
MF574558 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015
MF574576 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015
MF574567 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015
MF574575 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015
MF574553 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015
MF574573 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015
MF574574 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015
MF574577 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015
MF574556 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015
MF574554 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015
MF574566 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015
MF574569 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015
MF574563 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015
MF574564 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015
MF574581 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015
MF574588 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015
MF574582 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015
MF574586 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015
MF574584 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015
MF574583 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015
MF574580 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015
MF574579 # COL/PRV_00027/2015 # misdated
MF574578 # COL/PRV_00028/2015 # misdated
MF574585 # COL/PAN_00029/2015 # misdated
MF574587 # COL/PAN_00030/2015 # misdated
KY785436 # BRA/2016/FC_DQ12D1 # large indel
KY559010 # Brazil/2016/ZBRX8 # large indel
KY559011 # Brazil/2016/ZBRX11 # large indel
KX986761 # CX17 # large indel
MF801405 # MEX/2016/mex27 # large indel
MF801424 # MEX/2016/mex50 # large indel
MF801377 # SLV/2016/ElSalvador_1055 # large indel
KU853013.1 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
KU740184.1 # GD01 # duplicate of other strain in dataset
KU761564.1 # GDZ16001 # duplicate of other strain in dataset
KX893855.1 # VEN/UF_2/2016 # duplicate of other strain in dataset
KY927808.1 # ZZ_1 # duplicate of other strain in dataset
KY003154.1 # VR10599/Pavia/2016 # export with unknown origin
KY003153.1 # 34997/Pavia/2016 # export with unknown origin
MF574552.1 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015
MF574559.1 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015
MF574560.1 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015
MF574561.1 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015
MF574571.1 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015
MF574555.1 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015
MF574557.1 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015
MF574562.1 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015
MF574572.1 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015
MF574570.1 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015
MF574565.1 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015
MF574568.1 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015
MF574558.1 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015
MF574576.1 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015
MF574567.1 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015
MF574575.1 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015
MF574553.1 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015
MF574573.1 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015
MF574574.1 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015
MF574577.1 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015
MF574556.1 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015
MF574554.1 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015
MF574566.1 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015
MF574569.1 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015
MF574563.1 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015
MF574564.1 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015
MF574581.1 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015
MF574588.1 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015
MF574582.1 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015
MF574586.1 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015
MF574584.1 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015
MF574583.1 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015
MF574580.1 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015
MF574579.1 # COL/PRV_00027/2015 # misdated
MF574578.1 # COL/PRV_00028/2015 # misdated
MF574585.1 # COL/PAN_00029/2015 # misdated
MF574587.1 # COL/PAN_00030/2015 # misdated
KY785436.1 # BRA/2016/FC_DQ12D1 # large indel
KY559010.1 # Brazil/2016/ZBRX8 # large indel
KY559011.1 # Brazil/2016/ZBRX11 # large indel
KX986761.1 # CX17 # large indel
MF801405.1 # MEX/2016/mex27 # large indel
MF801424.1 # MEX/2016/mex50 # large indel
MF801377.1 # SLV/2016/ElSalvador_1055 # large indel
VI20_12plex # USVI/20/2016 # large indel
USVI/21/2016 # large indel
VI23_12plex # USVI/23/2016 # large indel
VI27_1d # USVI/27/2016 # large indel
VI30_1d # USVI/30/2016 # large indel
VI32_12plex # USVI/32/2016 # large indel
KY126351 # Thailand/1605aTw # excess divergence
KU744693 # VE_Ganxian # excess divergence
KY328290 # ZK_YN001 # excess divergence
KY415986 # Haiti/0029/2014 # contamination present
KY415987 # Haiti/0033/2014 # contamination present
KY415990 # Haiti/0036/2014 # contamination present
KY415988 # Haiti/0054/2014 # contamination present
KY415989 # Haiti/0074/2014 # contamination present
KY415991 # Haiti/0097/2014 # contamination present
MF384325 # mosquito/Haiti/1682/2016 # contamination present
ZF36_36S # contamination present
MK105975 # MR766 # lab strain
KX856011 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
MK028857 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
MN025403 # V15555 # highly diverged
MT505349 # DK # lab strain
MT505350 # DK23 # lab strain
MW680969 # rGZ02a/2018 # highly diverged
MW680970 # rGZ02p/2018 # highly diverged
OK054351 # V211784 # highly diverged
MT478034 # LMM/AG5643
OL414716 # Faranah/18
KY126351.1 # Thailand/1605aTw # excess divergence
KU744693.1 # VE_Ganxian # excess divergence
KY328290.1 # ZK_YN001 # excess divergence
KY415986.1 # Haiti/0029/2014 # contamination present
KY415987.1 # Haiti/0033/2014 # contamination present
KY415990.1 # Haiti/0036/2014 # contamination present
KY415988.1 # Haiti/0054/2014 # contamination present
KY415989.1 # Haiti/0074/2014 # contamination present
KY415991.1 # Haiti/0097/2014 # contamination present
MF384325.1 # mosquito/Haiti/1682/2016 # contamination present
ZF36_36S.1 # contamination present
MK105975.1 # MR766 # lab strain
KX856011.1 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
MK028857.1 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
MN025403.1 # V15555 # highly diverged
MT505349.1 # DK # lab strain
MT505350.1 # DK23 # lab strain
MW680969.1 # rGZ02a/2018 # highly diverged
MW680970.1 # rGZ02p/2018 # highly diverged
OK054351.1 # V211784 # highly diverged
MT478034.1 # LMM/AG5643
OL414716.1 # Faranah/18
MT505349.1 # Synthetic
MT505350.1 # Synthetic
MK105975.1 # Highly diverged

0 comments on commit 9605bf5

Please sign in to comment.