diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 0213ee5b..6a748aa3 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -28,23 +28,22 @@ curate:
   # NCBI Fields to rename to Nextstrain field names.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: [
-    'accession=genbank_accession',
-    'accession-rev=genbank_accession_rev',
-    'isolate-lineage=strain',
-    'sourcedb=database', # necessary for applying geo location rules
-    'geo-region=region',
-    'geo-location=location',
-    'host-name=host',
-    'isolate-collection-date=date',
-    'release-date=release_date',
-    'update-date=update_date',
-    'virus-tax-id=virus_tax_id',
-    'virus-name=virus_name',
-    'sra-accs=sra_accessions',
-    'submitter-names=authors',
-    'submitter-affiliation=institution',
-  ]
+  field_map:
+    accession: genbank_accession
+    accession-rev: genbank_accession_rev
+    isolate-lineage: strain
+    sourcedb: database  # necessary for applying geo location rules
+    geo-region: region
+    geo-location: location
+    host-name: host
+    isolate-collection-date: date
+    release-date: release_date
+    update-date: update_date
+    virus-tax-id: virus_tax_id
+    virus-name: virus_name
+    sra-accs: sra_accessions
+    submitter-names: authors
+    submitter-affiliation: institution
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 8adba409..6d5374d3 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -36,6 +36,18 @@ rule concat_geolocation_rules:
         """
 
 
+def format_field_map(field_map: dict[str, str]) -> str:
+    """
+    Format dict to `key1=value1 key2=value2 ...` for use in shell commands.
+
+    Each `key=value` pair is shell-quoted as a single word, so spaces, quotes,
+    `$`, and backticks in the config reach the command literally.
+    """
+    from shlex import quote
+
+    return " ".join(quote(f"{key}={value}") for key, value in field_map.items())
+
+
 rule curate:
     input:
         sequences_ndjson="data/sequences.ndjson",
@@ -47,7 +59,7 @@ rule curate:
     log:
         "logs/curate.txt",
     params:
-        field_map=config["curate"]["field_map"],
+        field_map=format_field_map(config["curate"]["field_map"]),
         strain_regex=config["curate"]["strain_regex"],
         strain_backup_fields=config["curate"]["strain_backup_fields"],
         date_fields=config["curate"]["date_fields"],