diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index fdf5cb4..927bd75 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -2,8 +2,27 @@ sources: ['genbank'] # Pathogen NCBI Taxonomy ID ncbi_taxon_id: '64320' -# Renames the NCBI dataset headers -ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv' +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country # Params for the transform rule transform: diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv deleted file mode 100644 index 57b4f8c..0000000 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ /dev/null @@ -1,17 +0,0 @@ -# Maps the NCBI output TSV column names back to the NCBI mnemonics. 
-# This list should match the list in -# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics -key value -Accession accession-rev -Source database sourcedb -Isolate Lineage isolate-lineage -Geographic Region geo-region -Geographic Location geo-location -Isolate Collection date isolate-collection-date -Release date release-date -Update date update-date -Length length -Host Name host-name -SRA Accessions sra-accs -Submitter Names submitter-names -Submitter Affiliation submitter-affiliation diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 8d27193..5d42d76 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -44,56 +44,23 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. The column names in the output TSV - are different from the mnemonics. 
- - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "bioprojects", - "biosample-acc", - "sra-accs", - "submitter-names", - "submitter-affiliation", - ] - return ",".join(fields) - - rule format_ncbi_dataset_report: # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", - ncbi_field_map=config["ncbi_field_map"], output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics, + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ - | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ - | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." 
\ - | tsv-select -H -f accession --rest last \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ > {output.ncbi_dataset_tsv} """ @@ -113,7 +80,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession-rev \ + --seq-id-column accession \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \ diff --git a/phylogenetic/config/dropped_strains.txt b/phylogenetic/config/dropped_strains.txt index 746e1ba..35df1e7 100644 --- a/phylogenetic/config/dropped_strains.txt +++ b/phylogenetic/config/dropped_strains.txt @@ -1,87 +1,90 @@ -MG827392 -KX369547 # PF13/251013_18 # reference included in config/zika_reference.gb -KY553111 # AFMC_U # too basal -KY962729 # AFMC_S # too basal -KY120353 # Boracay/16423 # too basal -KU179098 # JMB_185 # too basal -KU681082 # PHL/2012/CPC_0740 # too basal +MG827392.1 +KX369547.1 # PF13/251013_18 # reference included in config/zika_reference.gb +KY553111.1 # AFMC_U # too basal +KY962729.1 # AFMC_S # too basal +KY120353.1 # Boracay/16423 # too basal +KU179098.1 # JMB_185 # too basal +KU681082.1 # PHL/2012/CPC_0740 # too basal VIE/Bra/2016 # too basal -KU853013 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset -KU740184 # GD01 # duplicate of other strain in dataset -KU761564 # GDZ16001 # duplicate of other strain in dataset -KX893855 # VEN/UF_2/2016 # duplicate of other strain in dataset -KY927808 # ZZ_1 # duplicate of other strain in dataset -KY003154 # VR10599/Pavia/2016 # export with unknown origin -KY003153 # 34997/Pavia/2016 # export with unknown origin -MF574552 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015 -MF574559 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015 -MF574560 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015 -MF574561 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015 -MF574571 # COL/FLR_00005/2015 
# duplicate of COL/FLR/2015 -MF574555 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015 -MF574557 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015 -MF574562 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015 -MF574572 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015 -MF574570 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015 -MF574565 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015 -MF574568 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015 -MF574558 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015 -MF574576 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015 -MF574567 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015 -MF574575 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015 -MF574553 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015 -MF574573 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015 -MF574574 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015 -MF574577 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015 -MF574556 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015 -MF574554 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015 -MF574566 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015 -MF574569 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015 -MF574563 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015 -MF574564 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015 -MF574581 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015 -MF574588 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015 -MF574582 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015 -MF574586 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015 -MF574584 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015 -MF574583 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015 -MF574580 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015 -MF574579 # COL/PRV_00027/2015 # misdated -MF574578 # COL/PRV_00028/2015 # misdated -MF574585 # COL/PAN_00029/2015 # misdated -MF574587 # COL/PAN_00030/2015 # misdated -KY785436 # BRA/2016/FC_DQ12D1 # large indel -KY559010 # Brazil/2016/ZBRX8 # large indel -KY559011 # Brazil/2016/ZBRX11 # large indel -KX986761 # CX17 
# large indel -MF801405 # MEX/2016/mex27 # large indel -MF801424 # MEX/2016/mex50 # large indel -MF801377 # SLV/2016/ElSalvador_1055 # large indel +KU853013.1 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset +KU740184.1 # GD01 # duplicate of other strain in dataset +KU761564.1 # GDZ16001 # duplicate of other strain in dataset +KX893855.1 # VEN/UF_2/2016 # duplicate of other strain in dataset +KY927808.1 # ZZ_1 # duplicate of other strain in dataset +KY003154.1 # VR10599/Pavia/2016 # export with unknown origin +KY003153.1 # 34997/Pavia/2016 # export with unknown origin +MF574552.1 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015 +MF574559.1 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015 +MF574560.1 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015 +MF574561.1 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015 +MF574571.1 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015 +MF574555.1 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015 +MF574557.1 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015 +MF574562.1 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015 +MF574572.1 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015 +MF574570.1 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015 +MF574565.1 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015 +MF574568.1 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015 +MF574558.1 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015 +MF574576.1 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015 +MF574567.1 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015 +MF574575.1 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015 +MF574553.1 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015 +MF574573.1 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015 +MF574574.1 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015 +MF574577.1 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015 +MF574556.1 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015 +MF574554.1 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015 +MF574566.1 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015 
+MF574569.1 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015 +MF574563.1 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015 +MF574564.1 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015 +MF574581.1 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015 +MF574588.1 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015 +MF574582.1 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015 +MF574586.1 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015 +MF574584.1 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015 +MF574583.1 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015 +MF574580.1 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015 +MF574579.1 # COL/PRV_00027/2015 # misdated +MF574578.1 # COL/PRV_00028/2015 # misdated +MF574585.1 # COL/PAN_00029/2015 # misdated +MF574587.1 # COL/PAN_00030/2015 # misdated +KY785436.1 # BRA/2016/FC_DQ12D1 # large indel +KY559010.1 # Brazil/2016/ZBRX8 # large indel +KY559011.1 # Brazil/2016/ZBRX11 # large indel +KX986761.1 # CX17 # large indel +MF801405.1 # MEX/2016/mex27 # large indel +MF801424.1 # MEX/2016/mex50 # large indel +MF801377.1 # SLV/2016/ElSalvador_1055 # large indel VI20_12plex # USVI/20/2016 # large indel USVI/21/2016 # large indel VI23_12plex # USVI/23/2016 # large indel VI27_1d # USVI/27/2016 # large indel VI30_1d # USVI/30/2016 # large indel VI32_12plex # USVI/32/2016 # large indel -KY126351 # Thailand/1605aTw # excess divergence -KU744693 # VE_Ganxian # excess divergence -KY328290 # ZK_YN001 # excess divergence -KY415986 # Haiti/0029/2014 # contamination present -KY415987 # Haiti/0033/2014 # contamination present -KY415990 # Haiti/0036/2014 # contamination present -KY415988 # Haiti/0054/2014 # contamination present -KY415989 # Haiti/0074/2014 # contamination present -KY415991 # Haiti/0097/2014 # contamination present -MF384325 # mosquito/Haiti/1682/2016 # contamination present -ZF36_36S # contamination present -MK105975 # MR766 # lab strain -KX856011 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016 -MK028857 # 
Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59 -MN025403 # V15555 # highly diverged -MT505349 # DK # lab strain -MT505350 # DK23 # lab strain -MW680969 # rGZ02a/2018 # highly diverged -MW680970 # rGZ02p/2018 # highly diverged -OK054351 # V211784 # highly diverged -MT478034 # LMM/AG5643 -OL414716 # Faranah/18 +KY126351.1 # Thailand/1605aTw # excess divergence +KU744693.1 # VE_Ganxian # excess divergence +KY328290.1 # ZK_YN001 # excess divergence +KY415986.1 # Haiti/0029/2014 # contamination present +KY415987.1 # Haiti/0033/2014 # contamination present +KY415990.1 # Haiti/0036/2014 # contamination present +KY415988.1 # Haiti/0054/2014 # contamination present +KY415989.1 # Haiti/0074/2014 # contamination present +KY415991.1 # Haiti/0097/2014 # contamination present +MF384325.1 # mosquito/Haiti/1682/2016 # contamination present +ZF36_36S # contamination present +MK105975.1 # MR766 # lab strain +KX856011.1 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016 +MK028857.1 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59 +MN025403.1 # V15555 # highly diverged +MT505349.1 # DK # lab strain +MT505350.1 # DK23 # lab strain +MW680969.1 # rGZ02a/2018 # highly diverged +MW680970.1 # rGZ02p/2018 # highly diverged +OK054351.1 # V211784 # highly diverged +MT478034.1 # LMM/AG5643 +OL414716.1 # Faranah/18 +MT505349.1 # Synthetic +MT505350.1 # Synthetic +MK105975.1 # Highly diverged \ No newline at end of file