Skip to content

Commit

Permalink
Modernize ncbi-field-map in config
Browse files Browse the repository at this point in the history
Match pathogen-repo-guide
nextstrain/pathogen-repo-guide@d9751bb
  • Loading branch information
j23414 committed Mar 18, 2024
1 parent c73c566 commit 7392f61
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 18 deletions.
33 changes: 16 additions & 17 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,22 @@ curate:
# NCBI Fields to rename to Nextstrain field names.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: [
'accession=genbank_accession',
'accession-rev=genbank_accession_rev',
'isolate-lineage=strain',
'sourcedb=database', # necessary for applying geo location rules
'geo-region=region',
'geo-location=location',
'host-name=host',
'isolate-collection-date=date',
'release-date=release_date',
'update-date=update_date',
'virus-tax-id=virus_tax_id',
'virus-name=virus_name',
'sra-accs=sra_accessions',
'submitter-names=authors',
'submitter-affiliation=institution',
]
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
isolate-lineage: strain
sourcedb: database
geo-region: region
geo-location: location
host-name: host
isolate-collection-date: date
release-date: release_date
update-date: update_date
virus-tax-id: virus_tax_id
virus-name: virus_name
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliation: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
Expand Down
9 changes: 8 additions & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ rule concat_geolocation_rules:
"""


def format_field_map(field_map: dict[str, str]) -> str:
    """
    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.

    Each key and value is wrapped in double quotes. Characters that keep a
    special meaning inside double quotes in a POSIX shell (backslash, double
    quote, dollar sign, backtick) are backslash-escaped so that the shell
    passes the text through literally instead of ending the quoted string or
    performing expansion / command substitution.

    Keys and values are converted with `str()` first, so scalars that the
    config loader parsed as non-strings (e.g. numbers) are accepted.

    Returns an empty string for an empty mapping.
    """
    # Only these four characters are special inside shell double quotes.
    shell_escapes = str.maketrans(
        {
            "\\": "\\\\",
            '"': '\\"',
            "$": "\\$",
            "`": "\\`",
        }
    )

    def double_quote(text) -> str:
        # Wrap in double quotes after neutralising shell-special characters.
        return '"' + str(text).translate(shell_escapes) + '"'

    return " ".join(
        f"{double_quote(key)}={double_quote(value)}"
        for key, value in field_map.items()
    )


rule curate:
input:
sequences_ndjson="data/sequences.ndjson",
Expand All @@ -47,7 +54,7 @@ rule curate:
log:
"logs/curate.txt",
params:
field_map=config["curate"]["field_map"],
field_map=format_field_map(config["curate"]["field_map"]),
strain_regex=config["curate"]["strain_regex"],
strain_backup_fields=config["curate"]["strain_backup_fields"],
date_fields=config["curate"]["date_fields"],
Expand Down

0 comments on commit 7392f61

Please sign in to comment.