diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index b85af3c4..1981f980 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -10,14 +10,14 @@ transform: # Fields to rename. # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution'] + field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution'] # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names strain_regex: '^.+$' # Back up strain name field if 'strain' doesn't match regex above strain_backup_fields: ['accession'] # List of date fields to standardize - date_fields: ['date', 'date_submitted'] + date_fields: ['date', 'date_released'] # Expected date formats present in date fields # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -63,7 +63,7 @@ transform: 'division', 'location', 'host', - 'date_submitted', + 'date_released', 'sra_accession', 'abbr_authors', 'authors', diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index eb794184..00c9db88 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -5,7 +5,7 @@ Isolate Lineage strain Geographic Region region Geographic Location location Isolate Collection date collected -Release date submitted +Release date released Update date updated Length length Host Name host