From 65733336b393e8b776b3906698834fb2afc3c628 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 18 Jan 2023 15:20:36 -0800 Subject: [PATCH 01/12] curate: Copy `transform-date-fields` from monkeypox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copied the `transform-date-fields` script from monkeypox/ingest¹ as the base of the new subcommand `augur curate format-dates`. Copied unmodified for now so that we can see modifications in version control history. ¹ https://github.com/nextstrain/monkeypox/blob/8cc4cf739b9af679e1d2fe29d9f702f42f60e636/ingest/bin/transform-date-fields --- augur/curate/format_dates.py | 154 +++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 augur/curate/format_dates.py diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py new file mode 100644 index 000000000..4ff2a6981 --- /dev/null +++ b/augur/curate/format_dates.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Standardizes format of date fields of the NDJSON record from stdin to +ISO 8601 date (YYYY-MM-DD) and outputs modified records to stdout. +""" +import argparse +import json +from datetime import datetime +from sys import stderr, stdin, stdout + + +def format_date(date_string: str, expected_formats: list) -> str: + """ + Originally from nextstrain/ncov-ingest + + Format *date_string* to ISO 8601 date (YYYY-MM-DD). + If *date_string* does not match *expected_formats*, return *date_string*. + If *date_string* is missing the year, return masked date 'XXXX-XX-XX'. + If *date_string* is an incomplete date (i.e. missing month or day), then + missing values are masked with 'XX'. 
+ + >>> expected_formats = ['%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d'] + + >>> format_date("01-01", expected_formats) + 'XXXX-XX-XX' + + >>> format_date("2020", expected_formats) + '2020-XX-XX' + + >>> format_date("2020-01", expected_formats) + '2020-01-XX' + + >>> format_date("2020-1-15", expected_formats) + '2020-01-15' + + >>> format_date("2020-1-1", expected_formats) + '2020-01-01' + + >>> format_date("2020-01-15", expected_formats) + '2020-01-15' + + >>> format_date("2020-01-15T00:00:00Z", expected_formats) + '2020-01-15' + """ + # Potential directives that datetime accepts that can return the correct year, month, day fields + # see https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes + # + # Allows us to check if year/month/day are included in the date format so we + # know when to mask incomplete dates with 'XX' + all_field_directives = {'%c', '%x', + ('%G', '%V', '%A'), ('%G', '%V', '%a'), ('%G', '%V', '%w'), ('%G', '%V', '%u') + } + month_and_day_directives = {'%j', + ('%U', '%A'), ('%U', '%a'), ('%U', '%w'), ('%U', '%u'), + ('%W', '%A'), ('%W', '%a'), ('%W', '%w'), ('%W', '%u') + } + year_directives = {'%y', '%Y'} + month_directives = {'%b', '%B', '%m'} + day_directives = {'%d'} + + def directive_is_included(potential_directives: set, date_format: str) -> bool: + """ + Checks if any of the directives in *potential_directives* is included + in *date_format* string. + + If an element within *potential_directives* is a tuple, then all directives + within the tuple must be included in *date_format*. 
+ """ + return any( + ( + (isinstance(directive, str) and directive in date_format) or + (isinstance(directive, tuple) and all(sub_directive in date_format for sub_directive in directive)) + ) + for directive in potential_directives + ) + + for date_format in expected_formats: + try: + parsed_date = datetime.strptime(date_string, date_format) + except ValueError: + continue + + # Default to date masked as 'XXXX-XX-XX' so we don't return incorrect dates + year_string = 'XXXX' + month_string = day_string = 'XX' + + parsed_year_string = str(parsed_date.year) + parsed_month_string = str(parsed_date.month).zfill(2) + parsed_day_string = str(parsed_date.day).zfill(2) + + # If a directive for ALL fields is included in date format, + # then use all of the parsed field strings + if (directive_is_included(all_field_directives, date_format)): + year_string = parsed_year_string + month_string = parsed_month_string + day_string = parsed_day_string + + # If not all fields directives are included, then check year + # directive was included in date format + elif(directive_is_included(year_directives, date_format)): + year_string = parsed_year_string + + # Only check for month and day directives if year is included + # Check if directive for BOTH month and year is included in date format + if (directive_is_included(month_and_day_directives, date_format)): + month_string = parsed_month_string + day_string = parsed_day_string + + # If not directives for BOTH month and day are included, then check + # month directive was included in date format + elif(directive_is_included(month_directives, date_format)): + month_string = parsed_month_string + + # Only check for day directives if month is included + if(directive_is_included(day_directives, date_format)): + day_string = parsed_day_string + + return f"{year_string}-{month_string}-{day_string}" + + if date_string: + print( + f"WARNING: Unable to transform date string {date_string!r} because it does not match", + f"any of the expected 
formats {expected_formats}.", + file=stderr + ) + + return date_string + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--date-fields", nargs="+", + help="List of date field names in the NDJSON record that need to be standardized.") + parser.add_argument("--expected-date-formats", nargs="+", + help="Expected date formats that are currently in the provided date fields." + + "If a date string matches multiple formats, it will be parsed as the first format in the list.") + + args = parser.parse_args() + + expected_formats = args.expected_date_formats + + for record in stdin: + record = json.loads(record) + + for field in args.date_fields: + date_string = record.get(field) + if date_string: + record[field] = format_date(date_string, expected_formats) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() From c7a4df32447124b9ba5e09e80a6e43486c3906ee Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 18 Jan 2023 16:32:48 -0800 Subject: [PATCH 02/12] curate format-dates: edits to fit into augur No changes to the internals of the script, just edits to fit into the Augur ecosystem: * add `register_parser` and `run` functions * remove shebang * docstring edits * move nested function out for additional doctests * Cram tests for the new subcommand --- augur/curate/__init__.py | 3 +- augur/curate/format_dates.py | 147 ++++++++++++-------- tests/functional/curate/cram/format_dates.t | 52 +++++++ 3 files changed, 141 insertions(+), 61 deletions(-) create mode 100644 tests/functional/curate/cram/format_dates.t diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 257ce2ff5..2d0f30b83 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -12,13 +12,14 @@ from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv 
from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod -from . import normalize_strings, passthru +from . import format_dates, normalize_strings, passthru SUBCOMMAND_ATTRIBUTE = '_curate_subcommand' SUBCOMMANDS = [ passthru, normalize_strings, + format_dates, ] diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 4ff2a6981..d5de59eda 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -1,44 +1,103 @@ -#!/usr/bin/env python3 """ -Standardizes format of date fields of the NDJSON record from stdin to -ISO 8601 date (YYYY-MM-DD) and outputs modified records to stdout. +Format date fields to ISO 8601 dates (YYYY-MM-DD), where incomplete dates +are masked with 'XX' (e.g. 2023 -> 2023-XX-XX). """ -import argparse -import json from datetime import datetime -from sys import stderr, stdin, stdout +from augur.io.print import print_err -def format_date(date_string: str, expected_formats: list) -> str: - """ - Originally from nextstrain/ncov-ingest +def register_parser(parent_subparsers): + parser = parent_subparsers.add_parser("format-dates", + parents=[parent_subparsers.shared_parser], + help=__doc__) - Format *date_string* to ISO 8601 date (YYYY-MM-DD). - If *date_string* does not match *expected_formats*, return *date_string*. - If *date_string* is missing the year, return masked date 'XXXX-XX-XX'. - If *date_string* is an incomplete date (i.e. missing month or day), then - missing values are masked with 'XX'. + required = parser.add_argument_group(title="REQUIRED") + required.add_argument("--date-fields", nargs="+", + help="List of date field names in the record that need to be standardized.") + required.add_argument("--expected-date-formats", nargs="+", + help="Expected date formats that are currently in the provided date fields, " + + "defined by standard format codes as listed at " + + "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. 
" + + "If a date string matches multiple formats, it will be parsed as the first format in the list.") + return parser - >>> expected_formats = ['%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d'] +def directive_is_included(potential_directives, date_format): + """ + Checks if any of the directives in *potential_directives* is included + in *date_format* string. + + If an element within *potential_directives* is a tuple, then all directives + within the tuple must be included in *date_format*. + + Parameters + ---------- + potential_directives: set[str] or set[tuple[str, ...]] + Set of potential directives to check + date_format: str + Date format string to check for directives + + Returns + ------- + bool: + Whether the provided *date_format* includes any of the *potential_directives* + + + >>> potential_directives = {('%y', '%b', '%d'), ('%y', '%B', '%d'), ('%y', '%m', '%d'),} + >>> directive_is_included(potential_directives, '%G-%V-%A') + False + >>> directive_is_included(potential_directives, '%y-%m') + False + >>> directive_is_included(potential_directives, '%y-%m-%d') + True + >>> directive_is_included(potential_directives, '%y-%m-%dT%H:%M:%SZ') + True + """ + return any( + ( + (isinstance(directive, str) and directive in date_format) or + (isinstance(directive, tuple) and all(sub_directive in date_format for sub_directive in directive)) + ) + for directive in potential_directives + ) + + +def format_date(date_string, expected_formats): + """ + Format *date_string* to ISO 8601 date (YYYY-MM-DD) by trying to parse it + as one of the provided *expected_formats*. + + Parameters + ---------- + date_string: str + Date string to format + expected_formats: list[str] + List of expected formats for the provided date string + + Returns + ------- + str : + Formatted date string. + If *date_string* does not match *expected_formats*, returns original *date_string*. + If *date_string* is an incomplete date, the date is masked with 'XX'. 
+ Dates without year will be formatted as 'XXXX-XX-XX', even if month/day are known. + Dates without month will be formatted as 'YYYY-XX-XX', even if day is known. + Dates without day will be formatted as 'YYYY-MM-XX'. + + + >>> expected_formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d'] >>> format_date("01-01", expected_formats) 'XXXX-XX-XX' - >>> format_date("2020", expected_formats) '2020-XX-XX' - >>> format_date("2020-01", expected_formats) '2020-01-XX' - >>> format_date("2020-1-15", expected_formats) '2020-01-15' - >>> format_date("2020-1-1", expected_formats) '2020-01-01' - >>> format_date("2020-01-15", expected_formats) '2020-01-15' - >>> format_date("2020-01-15T00:00:00Z", expected_formats) '2020-01-15' """ @@ -58,22 +117,6 @@ def format_date(date_string: str, expected_formats: list) -> str: month_directives = {'%b', '%B', '%m'} day_directives = {'%d'} - def directive_is_included(potential_directives: set, date_format: str) -> bool: - """ - Checks if any of the directives in *potential_directives* is included - in *date_format* string. - - If an element within *potential_directives* is a tuple, then all directives - within the tuple must be included in *date_format*. - """ - return any( - ( - (isinstance(directive, str) and directive in date_format) or - (isinstance(directive, tuple) and all(sub_directive in date_format for sub_directive in directive)) - ) - for directive in potential_directives - ) - for date_format in expected_formats: try: parsed_date = datetime.strptime(date_string, date_format) @@ -118,37 +161,21 @@ def directive_is_included(potential_directives: set, date_format: str) -> bool: return f"{year_string}-{month_string}-{day_string}" if date_string: - print( + print_err( f"WARNING: Unable to transform date string {date_string!r} because it does not match", - f"any of the expected formats {expected_formats}.", - file=stderr + f"any of the expected formats {expected_formats}." 
) return date_string -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--date-fields", nargs="+", - help="List of date field names in the NDJSON record that need to be standardized.") - parser.add_argument("--expected-date-formats", nargs="+", - help="Expected date formats that are currently in the provided date fields." + - "If a date string matches multiple formats, it will be parsed as the first format in the list.") - - args = parser.parse_args() - - expected_formats = args.expected_date_formats - - for record in stdin: - record = json.loads(record) +def run(args, records): + for record in records: + record = record.copy() for field in args.date_fields: date_string = record.get(field) if date_string: - record[field] = format_date(date_string, expected_formats) + record[field] = format_date(date_string, args.expected_date_formats) - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() + yield record diff --git a/tests/functional/curate/cram/format_dates.t b/tests/functional/curate/cram/format_dates.t new file mode 100644 index 000000000..a7d929f2a --- /dev/null +++ b/tests/functional/curate/cram/format_dates.t @@ -0,0 +1,52 @@ +Setup + + $ pushd "$TESTDIR" > /dev/null + $ export AUGUR="${AUGUR:-../../../../bin/augur}" + +Create NDJSON file for testing format_dates with different forms + + $ cat >$TMP/records.ndjson <<~~ + > {"record": 1, "date": "2020", "collectionDate": "2020-01", "releaseDate": "2020-01","updateDate": "2020-07-18T00:00:00Z"} + > ~~ + +Test output with matching expected date formats. 
+ + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" + {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01-XX", "releaseDate": "2020-01-XX", "updateDate": "2020-07-18"} + +Test output with unmatched expected date formats. +This is expected to output a warning and return the date strings in their original format. + + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" + WARNING: Unable to transform date string '2020-01' because it does not match any of the expected formats ['%Y', '%Y-%m-%dT%H:%M:%SZ']. + WARNING: Unable to transform date string '2020-01' because it does not match any of the expected formats ['%Y', '%Y-%m-%dT%H:%M:%SZ']. + {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01", "releaseDate": "2020-01", "updateDate": "2020-07-18"} + +Test output with multiple matching expected date formats. +Date with multiple matches will be parsed according to first matching format. +The "collectionDate" and "releaseDate" will match the first "%Y-%j" format, which is a complete date. + + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%j" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" + {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01-01", "releaseDate": "2020-01-01", "updateDate": "2020-07-18"} + +Test output with chained format-dates commands that parses different fields with different expected formats. +Since "collectionDate" and "releaseDate" have expected formats overlap, +we can split them into two chained commands that parses them with different expected formats to produce the desired results. 
+ + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" \ + > | ${AUGUR} curate format-dates \ + > --date-field "collectionDate" \ + > --expected-date-formats "%Y-%j" + {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01-01", "releaseDate": "2020-01-XX", "updateDate": "2020-07-18"} From 57224190c633f2a012c77ec4f66a435ac83903f0 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 18 Jan 2023 17:27:38 -0800 Subject: [PATCH 03/12] curate format-dates: explicitly list combinations of directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes the cascade of directive checks to explicit sets of tuples of directives so that it is easier to read/understand as suggested by @tsibley in a previous review.¹ This also allows us to simplify the `directive_is_included` function to only expect tuples as potential directives.² ¹ https://github.com/nextstrain/monkeypox/pull/45#discussion_r898489288 ² https://github.com/nextstrain/monkeypox/pull/45#discussion_r898458449 --- augur/curate/format_dates.py | 75 ++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index d5de59eda..242dfd9db 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -32,7 +32,7 @@ def directive_is_included(potential_directives, date_format): Parameters ---------- - potential_directives: set[str] or set[tuple[str, ...]] + potential_directives: set[tuple[str, ...]] Set of potential directives to check date_format: str Date format string to check for directives @@ -54,10 +54,7 @@ def directive_is_included(potential_directives, date_format): True """ return any( - ( - (isinstance(directive, str) and directive in date_format) or - (isinstance(directive, tuple) and all(sub_directive in date_format for 
sub_directive in directive)) - ) + all(sub_directive in date_format for sub_directive in directive) for directive in potential_directives ) @@ -101,21 +98,34 @@ def format_date(date_string, expected_formats): >>> format_date("2020-01-15T00:00:00Z", expected_formats) '2020-01-15' """ - # Potential directives that datetime accepts that can return the correct year, month, day fields - # see https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes - # - # Allows us to check if year/month/day are included in the date format so we - # know when to mask incomplete dates with 'XX' - all_field_directives = {'%c', '%x', - ('%G', '%V', '%A'), ('%G', '%V', '%a'), ('%G', '%V', '%w'), ('%G', '%V', '%u') + # Set of directives that can be converted to complete date with year, month, and day + year_month_day_directives = { + # Locale's full date representation + ('%c',),('%x',), + # Dates with ISO 8601 week dates for year ('%G' is NOT interchangeable with '%Y'), ISO 8601 week ('%V'), and weekdays + ('%G', '%V', '%A'),('%G', '%V', '%a'),('%G', '%V', '%w'),('%G', '%V', '%u'), + # Dates with year, week, and weekday + ('%y', '%U', '%A'), ('%y', '%U', '%a'), ('%y', '%U', '%w'), ('%y', '%U', '%u'), + ('%y', '%W', '%A'), ('%y', '%W', '%a'), ('%y', '%W', '%w'), ('%y', '%W', '%u'), + ('%Y', '%U', '%A'), ('%Y', '%U', '%a'), ('%Y', '%U', '%w'), ('%Y', '%U', '%u'), + ('%Y', '%W', '%A'), ('%Y', '%W', '%a'), ('%Y', '%W', '%w'), ('%Y', '%W', '%u'), + # Dates with year and day of the year + ('%y', '%j'), ('%Y', '%j'), + # Dates with year, month, and day + ('%y', '%b', '%d'), ('%y', '%B', '%d'), ('%y', '%m', '%d'), + ('%Y', '%b', '%d'), ('%Y', '%B', '%d'), ('%Y', '%m', '%d'), + } + + # Set of directives that can be converted to incomplete dates, missing the day + year_month_directives = { + ('%y', '%b'), ('%y', '%B'), ('%y', '%m'), + ('%Y', '%b'), ('%Y', '%B'), ('%Y', '%m'), } - month_and_day_directives = {'%j', - ('%U', '%A'), ('%U', '%a'), ('%U', '%w'), ('%U', '%u'), - 
('%W', '%A'), ('%W', '%a'), ('%W', '%w'), ('%W', '%u') + + # Set of directives that can be converted to incomplete dates, missing the month and day + year_directives = { + ('%y',), ('%Y',) } - year_directives = {'%y', '%Y'} - month_directives = {'%b', '%B', '%m'} - day_directives = {'%d'} for date_format in expected_formats: try: @@ -131,32 +141,23 @@ def format_date(date_string, expected_formats): parsed_month_string = str(parsed_date.month).zfill(2) parsed_day_string = str(parsed_date.day).zfill(2) - # If a directive for ALL fields is included in date format, + # If directives for all year,month,day fields are included in date_format, # then use all of the parsed field strings - if (directive_is_included(all_field_directives, date_format)): + if directive_is_included(year_month_day_directives, date_format): year_string = parsed_year_string month_string = parsed_month_string day_string = parsed_day_string - # If not all fields directives are included, then check year - # directive was included in date format - elif(directive_is_included(year_directives, date_format)): + # If directives only include year and month are included in date_format, + # then only use the parsed year and month field strings + elif directive_is_included(year_month_directives, date_format): year_string = parsed_year_string + month_string = parsed_month_string - # Only check for month and day directives if year is included - # Check if directive for BOTH month and year is included in date format - if (directive_is_included(month_and_day_directives, date_format)): - month_string = parsed_month_string - day_string = parsed_day_string - - # If not directives for BOTH month and day are included, then check - # month directive was included in date format - elif(directive_is_included(month_directives, date_format)): - month_string = parsed_month_string - - # Only check for day directives if month is included - if(directive_is_included(day_directives, date_format)): - day_string = parsed_day_string 
+ # If directives only include year in date_format, the only use the + # parsed year field string + elif directive_is_included(year_directives, date_format): + year_string = parsed_year_string return f"{year_string}-{month_string}-{day_string}" From dfbb1311ee6fde4e2c01e1ba69b2a6ac03d9ee6e Mon Sep 17 00:00:00 2001 From: Jover Date: Mon, 12 Jun 2023 12:55:42 -0700 Subject: [PATCH 04/12] curate/format-dates: update directives Use `itertools.product` to more clearly see the possible values for each part, and put these constants in a separate file to avoid setting the same values upon every call to `format_date()`. Co-authored-by: Victor Lin <13424970+victorlin@users.noreply.github.com> --- augur/curate/format_dates.py | 35 +++---------------------- augur/curate/format_dates_directives.py | 28 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 31 deletions(-) create mode 100644 augur/curate/format_dates_directives.py diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 242dfd9db..dfc435e38 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -4,6 +4,7 @@ """ from datetime import datetime from augur.io.print import print_err +from .format_dates_directives import YEAR_DIRECTIVES, YEAR_MONTH_DIRECTIVES, YEAR_MONTH_DAY_DIRECTIVES def register_parser(parent_subparsers): @@ -98,34 +99,6 @@ def format_date(date_string, expected_formats): >>> format_date("2020-01-15T00:00:00Z", expected_formats) '2020-01-15' """ - # Set of directives that can be converted to complete date with year, month, and day - year_month_day_directives = { - # Locale's full date representation - ('%c',),('%x',), - # Dates with ISO 8601 week dates for year ('%G' is NOT interchangeable with '%Y'), ISO 8601 week ('%V'), and weekdays - ('%G', '%V', '%A'),('%G', '%V', '%a'),('%G', '%V', '%w'),('%G', '%V', '%u'), - # Dates with year, week, and weekday - ('%y', '%U', '%A'), ('%y', '%U', '%a'), ('%y', '%U', '%w'), ('%y', '%U', '%u'), - ('%y', '%W', '%A'), 
('%y', '%W', '%a'), ('%y', '%W', '%w'), ('%y', '%W', '%u'), - ('%Y', '%U', '%A'), ('%Y', '%U', '%a'), ('%Y', '%U', '%w'), ('%Y', '%U', '%u'), - ('%Y', '%W', '%A'), ('%Y', '%W', '%a'), ('%Y', '%W', '%w'), ('%Y', '%W', '%u'), - # Dates with year and day of the year - ('%y', '%j'), ('%Y', '%j'), - # Dates with year, month, and day - ('%y', '%b', '%d'), ('%y', '%B', '%d'), ('%y', '%m', '%d'), - ('%Y', '%b', '%d'), ('%Y', '%B', '%d'), ('%Y', '%m', '%d'), - } - - # Set of directives that can be converted to incomplete dates, missing the day - year_month_directives = { - ('%y', '%b'), ('%y', '%B'), ('%y', '%m'), - ('%Y', '%b'), ('%Y', '%B'), ('%Y', '%m'), - } - - # Set of directives that can be converted to incomplete dates, missing the month and day - year_directives = { - ('%y',), ('%Y',) - } for date_format in expected_formats: try: @@ -143,20 +116,20 @@ def format_date(date_string, expected_formats): # If directives for all year,month,day fields are included in date_format, # then use all of the parsed field strings - if directive_is_included(year_month_day_directives, date_format): + if directive_is_included(YEAR_MONTH_DAY_DIRECTIVES, date_format): year_string = parsed_year_string month_string = parsed_month_string day_string = parsed_day_string # If directives only include year and month are included in date_format, # then only use the parsed year and month field strings - elif directive_is_included(year_month_directives, date_format): + elif directive_is_included(YEAR_MONTH_DIRECTIVES, date_format): year_string = parsed_year_string month_string = parsed_month_string # If directives only include year in date_format, the only use the # parsed year field string - elif directive_is_included(year_directives, date_format): + elif directive_is_included(YEAR_DIRECTIVES, date_format): year_string = parsed_year_string return f"{year_string}-{month_string}-{day_string}" diff --git a/augur/curate/format_dates_directives.py b/augur/curate/format_dates_directives.py new file 
mode 100644 index 000000000..0ad5b93bf --- /dev/null +++ b/augur/curate/format_dates_directives.py @@ -0,0 +1,28 @@ +from itertools import product + +year = {'%y', '%Y'} +month = {'%b', '%B', '%m'} +day = {'%d'} +month_and_day = {'%j'} +week = {'%U', '%W'} +day_of_week = {'%A', '%a', '%w', '%u'} + +# Set of directives that can be converted to complete date with year, month, and day +YEAR_MONTH_DAY_DIRECTIVES = ( + # Locale's full date representation + {('%c',),('%x',)} | + # Dates with ISO 8601 week dates for year ('%G' is NOT interchangeable with '%Y'), ISO 8601 week ('%V'), and weekdays + {('%G', '%V', '%A'),('%G', '%V', '%a'),('%G', '%V', '%w'),('%G', '%V', '%u')} | + # Dates with year, week, and weekday + set(product(year, week, day_of_week)) | + # Dates with year and day of the year + set(product(year, month_and_day)) | + # Dates with year, month, and day + set(product(year, month, day)) +) + +# Set of directives that can be converted to incomplete dates, missing the day +YEAR_MONTH_DIRECTIVES = set(product(year, month)) + +# Set of directives that can be converted to incomplete dates, missing the month and day +YEAR_DIRECTIVES = set(product(year)) From 140f4c2f878c7a0c0e062cc7a47c6a684545f335 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 18 Jan 2023 17:46:15 -0800 Subject: [PATCH 05/12] directive_is_included: account for '%%' directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit '%%' represents an escaped directive that is a literal '%' character, so it needs to be excluded in the date format check. 
Using the regex negative lookbehind assertion suggested by @tsibley in a previous review.¹ ¹ https://github.com/nextstrain/monkeypox/pull/45#discussion_r898485582 --- augur/curate/format_dates.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index dfc435e38..1a4d7e0ca 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -2,6 +2,7 @@ Format date fields to ISO 8601 dates (YYYY-MM-DD), where incomplete dates are masked with 'XX' (e.g. 2023 -> 2023-XX-XX). """ +import re from datetime import datetime from augur.io.print import print_err from .format_dates_directives import YEAR_DIRECTIVES, YEAR_MONTH_DIRECTIVES, YEAR_MONTH_DAY_DIRECTIVES @@ -49,13 +50,19 @@ def directive_is_included(potential_directives, date_format): False >>> directive_is_included(potential_directives, '%y-%m') False + >>> directive_is_included(potential_directives, '%%y-%m-%d') + False >>> directive_is_included(potential_directives, '%y-%m-%d') True >>> directive_is_included(potential_directives, '%y-%m-%dT%H:%M:%SZ') True """ return any( - all(sub_directive in date_format for sub_directive in directive) + all( + # Exclude escaped directives (e.g. '%%Y' means literal '%Y' not a four digit year) + bool(re.search(f"(? Date: Fri, 20 Jan 2023 17:54:26 -0800 Subject: [PATCH 06/12] curate format-dates: Add `--failure-reporting` Add `--failure-reporting` option to dictate how failed date formatting should be reported. Similar to the duplicate/unmatched-reporting options for curate inputs, the choices are `error_first`, `error_all`, `warn`, or `silent`. The `error_all` and `warn` options print summaries for all records that failed as `(record, field, date string)`. I chose this format to minimize the memory footprint while still providing enough info for users to debug issues. 
--- augur/curate/format_dates.py | 66 +++++++++++++++++---- tests/functional/curate/cram/format_dates.t | 47 +++++++++++++-- 2 files changed, 95 insertions(+), 18 deletions(-) diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 1a4d7e0ca..fd6d50a9b 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -4,7 +4,10 @@ """ import re from datetime import datetime + +from augur.errors import AugurError from augur.io.print import print_err +from augur.types import DataErrorMethod from .format_dates_directives import YEAR_DIRECTIVES, YEAR_MONTH_DIRECTIVES, YEAR_MONTH_DAY_DIRECTIVES @@ -21,6 +24,14 @@ def register_parser(parent_subparsers): "defined by standard format codes as listed at " + "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. " + "If a date string matches multiple formats, it will be parsed as the first format in the list.") + + optional = parser.add_argument_group(title="OPTIONAL") + optional.add_argument("--failure-reporting", + type=DataErrorMethod.argtype, + choices=list(DataErrorMethod), + default=DataErrorMethod.ERROR_FIRST, + help="How should failed date formatting be reported.") + return parser @@ -81,9 +92,8 @@ def format_date(date_string, expected_formats): Returns ------- - str : - Formatted date string. - If *date_string* does not match *expected_formats*, returns original *date_string*. + str or None: + Formatted date string or None if the parsing of the date string failed. If *date_string* is an incomplete date, the date is masked with 'XX'. Dates without year will be formatted as 'XXXX-XX-XX', even if month/day are known. Dates without month will be formatted as 'YYYY-XX-XX', even if day is known. 
@@ -141,22 +151,52 @@ def format_date(date_string, expected_formats): return f"{year_string}-{month_string}-{day_string}" - if date_string: - print_err( - f"WARNING: Unable to transform date string {date_string!r} because it does not match", - f"any of the expected formats {expected_formats}." - ) - - return date_string + return None def run(args, records): - for record in records: + failures = [] + failure_reporting = args.failure_reporting + for index, record in enumerate(records): record = record.copy() + record_id = index for field in args.date_fields: date_string = record.get(field) - if date_string: - record[field] = format_date(date_string, args.expected_date_formats) + + if not date_string: + continue + + formatted_date_string = format_date(date_string, args.expected_date_formats) + if formatted_date_string is None: + + if failure_reporting is DataErrorMethod.SILENT: + continue + + failure_message = f"Unable to format date string {date_string!r} in field {field!r} of record {record_id!r}." 
+ if failure_reporting is DataErrorMethod.ERROR_FIRST: + raise AugurError(failure_message) + + if failure_reporting is DataErrorMethod.WARN: + print_err(f"WARNING: {failure_message}") + + # Keep track of failures for final summary + failures.append((record_id, field, date_string)) + else: + record[field] = formatted_date_string yield record + + if failure_reporting is not DataErrorMethod.SILENT and failures: + failure_message = ( + "Unable to format dates for the following (record, field, date string):\n" + \ + '\n'.join(map(repr, failures)) + ) + if failure_reporting is DataErrorMethod.ERROR_ALL: + raise AugurError(failure_message) + + elif failure_reporting is DataErrorMethod.WARN: + print_err(f"WARNING: {failure_message}") + + else: + raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}") diff --git a/tests/functional/curate/cram/format_dates.t b/tests/functional/curate/cram/format_dates.t index a7d929f2a..ec521cea9 100644 --- a/tests/functional/curate/cram/format_dates.t +++ b/tests/functional/curate/cram/format_dates.t @@ -17,15 +17,52 @@ Test output with matching expected date formats. > --expected-date-formats "%Y" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01-XX", "releaseDate": "2020-01-XX", "updateDate": "2020-07-18"} -Test output with unmatched expected date formats. -This is expected to output a warning and return the date strings in their original format. +Test output with unmatched expected date formats with default `ERROR_FIRST` failure reporting. +This is expected to fail with an error, so redirecting stdout since we don't care about the output. $ cat $TMP/records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ - > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" - WARNING: Unable to transform date string '2020-01' because it does not match any of the expected formats ['%Y', '%Y-%m-%dT%H:%M:%SZ']. 
- WARNING: Unable to transform date string '2020-01' because it does not match any of the expected formats ['%Y', '%Y-%m-%dT%H:%M:%SZ']. + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" 1> /dev/null + ERROR: Unable to format date string '2020-01' in field 'collectionDate' of record 0. + [2] + +Test output with unmatched expected date formats with `ERROR_ALL` failure reporting. +This is expected to fail with an error, so redirecting stdout since we don't care about the output. + + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ + > --failure-reporting "error_all" 1> /dev/null + ERROR: Unable to format dates for the following (record, field, date string): + (0, 'collectionDate', '2020-01') + (0, 'releaseDate', '2020-01') + [2] + +Test output with unmatched expected date formats while warning on failures. +This is expected to print warnings for failures and return the date strings in their original format. + + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ + > --failure-reporting "warn" + WARNING: Unable to format date string '2020-01' in field 'collectionDate' of record 0. + WARNING: Unable to format date string '2020-01' in field 'releaseDate' of record 0. + WARNING: Unable to format dates for the following (record, field, date string): + (0, 'collectionDate', '2020-01') + (0, 'releaseDate', '2020-01') + {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01", "releaseDate": "2020-01", "updateDate": "2020-07-18"} + +Test output with unmatched expected date formats while silencing failures. +This is expected to return the date strings in their original format. 
+ + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ + > --failure-reporting "silent" {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01", "releaseDate": "2020-01", "updateDate": "2020-07-18"} Test output with multiple matching expected date formats. From 0ca489614350a94902def92eb8ce53df105dd0ac Mon Sep 17 00:00:00 2001 From: Jover Date: Mon, 12 Jun 2023 13:28:00 -0700 Subject: [PATCH 07/12] curate: simplify uses of `DataErrorMethod` enum Now that `DataErrorMethod` is an `ArgparseEnum`, we no longer have to specify `value` for the argparse choices and we do not have to convert the argument strings back to enum members. Specify type as `DataErrorMethod.argtype` in order to get better error messages when the provided option value is not a valid choice. --- augur/curate/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 2d0f30b83..fa44f9a37 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -64,12 +64,14 @@ def create_shared_parser(): help="The name to use for the sequence field when joining sequences from a FASTA file.") shared_inputs.add_argument("--unmatched-reporting", - choices=[ method.value for method in DataErrorMethod ], - default=DataErrorMethod.ERROR_FIRST.value, + type=DataErrorMethod.argtype, + choices=list(DataErrorMethod), + default=DataErrorMethod.ERROR_FIRST, help="How unmatched records from combined metadata/FASTA input should be reported.") shared_inputs.add_argument("--duplicate-reporting", - choices=[ method.value for method in DataErrorMethod ], - default=DataErrorMethod.ERROR_FIRST.value, + type=DataErrorMethod.argtype, + choices=list(DataErrorMethod), + default=DataErrorMethod.ERROR_FIRST, help="How should duplicate records be reported.") shared_outputs = 
shared_parser.add_argument_group( @@ -143,8 +145,8 @@ def run(args): args.fasta, args.seq_id_column, args.seq_field, - DataErrorMethod(args.unmatched_reporting), - DataErrorMethod(args.duplicate_reporting)) + args.unmatched_reporting, + args.duplicate_reporting) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " @@ -153,7 +155,7 @@ def run(args): ) elif args.metadata: try: - records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column) + records = read_table_to_dict(args.metadata, args.metadata_delimiters, args.duplicate_reporting, args.id_column) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " From 4d02220719b66b5f775dfe23d71e81a44dd69699 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 27 Jan 2023 17:01:59 -0800 Subject: [PATCH 08/12] curate format-dates: Mask failed date strings By default, completely mask date strings with "XXXX-XX-XX" for dates that failed date formatting so that they are still in the proper ISO 8601 dates format for downstream Augur commands. Users can turn off the masking with the added `--no-mask-failure` option. Note the `store_false` actions produce semi-misleading docs in the help output, so the new option suppresses the default value in the help message with `SKIP_AUTO_DEFAULT_IN_HELP` as suggested by @tsibley in review. 
--- augur/argparse_.py | 14 ++++++++++++++ augur/curate/format_dates.py | 9 +++++++++ tests/functional/curate/cram/format_dates.t | 17 ++++++++++++++--- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/augur/argparse_.py b/augur/argparse_.py index b732963ee..175afa71e 100644 --- a/augur/argparse_.py +++ b/augur/argparse_.py @@ -4,6 +4,20 @@ from argparse import Action, ArgumentDefaultsHelpFormatter +# Include this in an argument help string to suppress the automatic appending +# of the default value by argparse.ArgumentDefaultsHelpFormatter. This works +# because the automatic appending is conditional on the presence of %(default), +# so we include it but then format it as a zero-length string .0s. 🙃 +# +# Another solution would be to add an extra attribute to the argument (the +# argparse.Action instance) and then subclass ArgumentDefaultsHelpFormatter to +# condition on that new attribute, but that seems more brittle. +# +# Copied from the Nextstrain CLI repo +# https://github.com/nextstrain/cli/blob/017c53805e8317951327d24c04184615cc400b09/nextstrain/cli/argparse.py#L13-L21 +SKIP_AUTO_DEFAULT_IN_HELP = "%(default).0s" + + def add_default_command(parser): """ Sets the default command to run when none is provided. 
diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index fd6d50a9b..003fcdf30 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -5,6 +5,7 @@ import re from datetime import datetime +from augur.argparse_ import SKIP_AUTO_DEFAULT_IN_HELP from augur.errors import AugurError from augur.io.print import print_err from augur.types import DataErrorMethod @@ -31,6 +32,10 @@ def register_parser(parent_subparsers): choices=list(DataErrorMethod), default=DataErrorMethod.ERROR_FIRST, help="How should failed date formatting be reported.") + optional.add_argument("--no-mask-failure", dest="mask_failure", + action="store_false", + help="Do not mask dates with 'XXXX-XX-XX' and return original date string if date formatting failed. " + + f"(default: False{SKIP_AUTO_DEFAULT_IN_HELP})") return parser @@ -169,6 +174,10 @@ def run(args, records): formatted_date_string = format_date(date_string, args.expected_date_formats) if formatted_date_string is None: + # Mask failed date formatting before processing error methods + # to ensure failures are masked even when failures are "silent" + if args.mask_failure: + record[field] = "XXXX-XX-XX" if failure_reporting is DataErrorMethod.SILENT: continue diff --git a/tests/functional/curate/cram/format_dates.t b/tests/functional/curate/cram/format_dates.t index ec521cea9..d686ef2ac 100644 --- a/tests/functional/curate/cram/format_dates.t +++ b/tests/functional/curate/cram/format_dates.t @@ -41,7 +41,7 @@ This is expected to fail with an error, so redirecting stdout since we don't car [2] Test output with unmatched expected date formats while warning on failures. -This is expected to print warnings for failures and return the date strings in their original format. +This is expected to print warnings for failures and return the masked date strings for failures. 
$ cat $TMP/records.ndjson \ > | ${AUGUR} curate format-dates \ @@ -53,16 +53,27 @@ This is expected to print warnings for failures and return the date strings in t WARNING: Unable to format dates for the following (record, field, date string): (0, 'collectionDate', '2020-01') (0, 'releaseDate', '2020-01') - {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01", "releaseDate": "2020-01", "updateDate": "2020-07-18"} + {"record": 1, "date": "2020-XX-XX", "collectionDate": "XXXX-XX-XX", "releaseDate": "XXXX-XX-XX", "updateDate": "2020-07-18"} Test output with unmatched expected date formats while silencing failures. -This is expected to return the date strings in their original format. +This is expected to return the masked date strings for failures. $ cat $TMP/records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ > --failure-reporting "silent" + {"record": 1, "date": "2020-XX-XX", "collectionDate": "XXXX-XX-XX", "releaseDate": "XXXX-XX-XX", "updateDate": "2020-07-18"} + +Test output with unmatched expected date formats while silencing failures with `--no-mask-failure`. +This is expected to return the date strings in their original format. + + $ cat $TMP/records.ndjson \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ + > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ + > --failure-reporting "silent" \ + > --no-mask-failure {"record": 1, "date": "2020-XX-XX", "collectionDate": "2020-01", "releaseDate": "2020-01", "updateDate": "2020-07-18"} Test output with multiple matching expected date formats. 
From 20670a91bd98e698db5ca8058cfc2e64462b18d8 Mon Sep 17 00:00:00 2001 From: Jover Date: Tue, 31 Jan 2023 13:44:33 -0800 Subject: [PATCH 09/12] curate: Update Cram tests according to #1133 --- tests/functional/curate/cram/_setup.sh | 1 + tests/functional/curate/cram/format_dates.t | 21 +++++----- .../curate/cram/metadata-and-fasta-input.t | 41 +++++++++---------- .../curate/cram/metadata-and-fasta-output.t | 39 +++++++++--------- tests/functional/curate/cram/metadata-input.t | 27 ++++++------ .../functional/curate/cram/metadata-output.t | 13 +++--- .../curate/cram/normalize_strings.t | 13 +++--- tests/functional/curate/cram/passthru.t | 7 ++-- 8 files changed, 78 insertions(+), 84 deletions(-) create mode 100644 tests/functional/curate/cram/_setup.sh diff --git a/tests/functional/curate/cram/_setup.sh b/tests/functional/curate/cram/_setup.sh new file mode 100644 index 000000000..032447690 --- /dev/null +++ b/tests/functional/curate/cram/_setup.sh @@ -0,0 +1 @@ +export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" diff --git a/tests/functional/curate/cram/format_dates.t b/tests/functional/curate/cram/format_dates.t index d686ef2ac..a785172a4 100644 --- a/tests/functional/curate/cram/format_dates.t +++ b/tests/functional/curate/cram/format_dates.t @@ -1,17 +1,16 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Create NDJSON file for testing format_dates with different forms - $ cat >$TMP/records.ndjson <<~~ + $ cat >records.ndjson <<~~ > {"record": 1, "date": "2020", "collectionDate": "2020-01", "releaseDate": "2020-01","updateDate": "2020-07-18T00:00:00Z"} > ~~ Test output with matching expected date formats. 
- $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" @@ -20,7 +19,7 @@ Test output with matching expected date formats. Test output with unmatched expected date formats with default `ERROR_FIRST` failure reporting. This is expected to fail with an error, so redirecting stdout since we don't care about the output. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" 1> /dev/null @@ -30,7 +29,7 @@ This is expected to fail with an error, so redirecting stdout since we don't car Test output with unmatched expected date formats with `ERROR_ALL` failure reporting. This is expected to fail with an error, so redirecting stdout since we don't care about the output. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ @@ -43,7 +42,7 @@ This is expected to fail with an error, so redirecting stdout since we don't car Test output with unmatched expected date formats while warning on failures. This is expected to print warnings for failures and return the masked date strings for failures. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ @@ -58,7 +57,7 @@ This is expected to print warnings for failures and return the masked date strin Test output with unmatched expected date formats while silencing failures. This is expected to return the masked date strings for failures. 
- $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ @@ -68,7 +67,7 @@ This is expected to return the masked date strings for failures. Test output with unmatched expected date formats while silencing failures with `--no-mask-failure`. This is expected to return the date strings in their original format. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m-%dT%H:%M:%SZ" \ @@ -80,7 +79,7 @@ Test output with multiple matching expected date formats. Date with multiple matches will be parsed according to first matching format. The "collectionDate" and "releaseDate" will match the first "%Y-%j" format, which is a complete date. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "collectionDate" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%j" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" @@ -90,7 +89,7 @@ Test output with chained format-dates commands that parses different fields with Since "collectionDate" and "releaseDate" have expected formats overlap, we can split them into two chained commands that parses them with different expected formats to produce the desired results. 
- $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate format-dates \ > --date-fields "date" "releaseDate" "updateDate" \ > --expected-date-formats "%Y" "%Y-%m" "%Y-%m-%dT%H:%M:%SZ" \ diff --git a/tests/functional/curate/cram/metadata-and-fasta-input.t b/tests/functional/curate/cram/metadata-and-fasta-input.t index 6817de7cf..84b6d7b11 100644 --- a/tests/functional/curate/cram/metadata-and-fasta-input.t +++ b/tests/functional/curate/cram/metadata-and-fasta-input.t @@ -1,14 +1,13 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Testing combined metadata and FASTA inputs for the curate command. Running the `passthru` subcommand since it does not do any data transformations. Create FASTA file for testing. - $ cat >$TMP/sequences.fasta <<~~ + $ cat >sequences.fasta <<~~ > >sequence_A > ATCG > >sequence_B @@ -19,7 +18,7 @@ Create FASTA file for testing. Create metadata TSV file for testing. - $ cat >$TMP/metadata.tsv <<~~ + $ cat >metadata.tsv <<~~ > strain country date > sequence_A USA 2020-10-01 > sequence_B USA 2020-10-02 @@ -30,7 +29,7 @@ Test metadata input with extra FASTA input options without a FASTA file. This is expected to fail with an error. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ + > --metadata metadata.tsv \ > --seq-id-column name \ > --seq-field sequences ERROR: The --seq-id-column and --seq-field options should only be used when providing a FASTA file. @@ -41,16 +40,16 @@ Test metadata and FASTA inputs without required FASTA input options. This is expected to fail with an error. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ - > --fasta $TMP/sequences.fasta + > --metadata metadata.tsv \ + > --fasta sequences.fasta ERROR: The --seq-id-column and --seq-field options are required for a FASTA file input. [2] Test metadata and FASTA inputs with required FASTA input options. 
$ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ - > --fasta $TMP/sequences.fasta \ + > --metadata metadata.tsv \ + > --fasta sequences.fasta \ > --seq-id-column strain \ > --seq-field seq {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "seq": "ATCG"} @@ -59,8 +58,8 @@ Test metadata and FASTA inputs with required FASTA input options. Create new metadata file with duplicate and extra metadata records. - $ cp $TMP/metadata.tsv $TMP/metadata-with-duplicate-and-unmatched-records.tsv - $ cat >>$TMP/metadata-with-duplicate-and-unmatched-records.tsv <<~~ + $ cp metadata.tsv metadata-with-duplicate-and-unmatched-records.tsv + $ cat >>metadata-with-duplicate-and-unmatched-records.tsv <<~~ > sequence_A USA 2020-10-XX > extra_metadata_A USA 2020-10-01 > extra_metadata_B USA 2020-10-02 @@ -68,8 +67,8 @@ Create new metadata file with duplicate and extra metadata records. Create new FASTA file with duplicate and extra sequence records. - $ cp $TMP/sequences.fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta - $ cat >>$TMP/sequences-with-duplicate-and-unmatched-records.fasta <<~~ + $ cp sequences.fasta sequences-with-duplicate-and-unmatched-records.fasta + $ cat >>sequences-with-duplicate-and-unmatched-records.fasta <<~~ > >sequence_A > NNNN > >extra_sequence_A @@ -82,8 +81,8 @@ Test metadata and FASTA inputs with duplicate and extra records and default `ERR This is expected to fail with an error, so redirecting stdout since we don't care about the output. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \ - > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \ + > --metadata metadata-with-duplicate-and-unmatched-records.tsv \ + > --fasta sequences-with-duplicate-and-unmatched-records.fasta \ > --seq-id-column strain \ > --seq-field seq 1> /dev/null ERROR: Encountered sequence record with duplicate id 'sequence_A'. 
@@ -93,8 +92,8 @@ Test metadata and FASTA inputs with duplicate and extra records with `ERROR_ALL` This is expected to fail with an error, so redirecting stdout since we don't care about the output. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \ - > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \ + > --metadata metadata-with-duplicate-and-unmatched-records.tsv \ + > --fasta sequences-with-duplicate-and-unmatched-records.fasta \ > --seq-id-column strain \ > --seq-field seq \ > --unmatched-reporting error_all \ @@ -119,8 +118,8 @@ This is expected run without error and only print a warning. Notice the duplicate sequence "sequence_A" will always use the first sequence in the FASTA file because of pyfastx. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \ - > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \ + > --metadata metadata-with-duplicate-and-unmatched-records.tsv \ + > --fasta sequences-with-duplicate-and-unmatched-records.fasta \ > --seq-id-column strain \ > --seq-field seq \ > --unmatched-reporting warn \ @@ -150,8 +149,8 @@ Test metadata and FASTA inputs with unmatched records in both, but ask to silent Notice the duplicate sequence "sequence_A" will always use the first sequence in the FASTA file because of pyfastx. 
$ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata-with-duplicate-and-unmatched-records.tsv \ - > --fasta $TMP/sequences-with-duplicate-and-unmatched-records.fasta \ + > --metadata metadata-with-duplicate-and-unmatched-records.tsv \ + > --fasta sequences-with-duplicate-and-unmatched-records.fasta \ > --seq-id-column strain \ > --seq-field seq \ > --unmatched-reporting silent \ diff --git a/tests/functional/curate/cram/metadata-and-fasta-output.t b/tests/functional/curate/cram/metadata-and-fasta-output.t index ab4afec2b..2036e4d2b 100644 --- a/tests/functional/curate/cram/metadata-and-fasta-output.t +++ b/tests/functional/curate/cram/metadata-and-fasta-output.t @@ -1,14 +1,13 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Testing combined metadata and FASTA output for the curate command. Running the `passthru` subcommand since it does not do any data transformations. Create NDJSON file for testing. - $ cat >$TMP/records.ndjson <<~~ + $ cat >records.ndjson <<~~ > {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "sequence": "AAAA"} > {"strain": "sequence_T", "country": "USA", "date": "2020-10-02", "sequence": "TTTT"} > {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "sequence": "CCCC"} @@ -16,9 +15,9 @@ Create NDJSON file for testing. Test metadata output with extra FASTA output options. This is expected to fail immediately with an error. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-metadata $TMP/metadata.tsv \ + > --output-metadata metadata.tsv \ > --output-id-field strain \ > --output-seq-field sequence ERROR: The --output-id-field and --output-seq-field options should only be used when requesting a FASTA output. @@ -26,27 +25,27 @@ This is expected to fail immediately with an error. Test metadata and FASTA outputs without requried FASTA output options. 
This is expected to fail immediately with an error. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-metadata $TMP/metadata.tsv \ - > --output-fasta $TMP/sequences.fasta + > --output-metadata metadata.tsv \ + > --output-fasta sequences.fasta ERROR: The --output-id-field and --output-seq-field options are required for a FASTA output. [2] Test metadata and FASTA outputs - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-metadata $TMP/metadata.tsv \ - > --output-fasta $TMP/sequences.fasta \ + > --output-metadata metadata.tsv \ + > --output-fasta sequences.fasta \ > --output-id-field strain \ > --output-seq-field sequence - $ cat $TMP/metadata.tsv + $ cat metadata.tsv strain\tcountry\tdate (esc) sequence_A\tUSA\t2020-10-01 (esc) sequence_T\tUSA\t2020-10-02 (esc) sequence_C\tUSA\t2020-10-03 (esc) - $ cat $TMP/sequences.fasta + $ cat sequences.fasta >sequence_A (esc) AAAA (esc) >sequence_T (esc) @@ -56,12 +55,12 @@ Test metadata and FASTA outputs Test FASTA output without metadata output. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-fasta $TMP/sequences.fasta \ + > --output-fasta sequences.fasta \ > --output-id-field strain \ > --output-seq-field sequence - $ cat $TMP/sequences.fasta + $ cat sequences.fasta >sequence_A (esc) AAAA (esc) >sequence_T (esc) @@ -72,9 +71,9 @@ Test FASTA output without metadata output. Test FASTA output with bad output id field. This is expected to fail with an error. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-fasta $TMP/sequences.fasta \ + > --output-fasta sequences.fasta \ > --output-id-field bogus_id \ > --output-seq-field sequence ERROR: Provided sequence identifier field 'bogus_id' does not exist. @@ -83,9 +82,9 @@ This is expected to fail with an error. Test FASTA output with bad output sequence field. 
This is expected to fail with an error. - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-fasta $TMP/sequences.fasta \ + > --output-fasta sequences.fasta \ > --output-id-field strain \ > --output-seq-field bogus_sequence ERROR: Provided sequence field 'bogus_sequence' does not exist. diff --git a/tests/functional/curate/cram/metadata-input.t b/tests/functional/curate/cram/metadata-input.t index b08ec828b..901b4fdff 100644 --- a/tests/functional/curate/cram/metadata-input.t +++ b/tests/functional/curate/cram/metadata-input.t @@ -1,14 +1,13 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Testing metadata inputs for the curate command. Running the `passthru` subcommand since it does not do any data transformations. Create metadata TSV file for testing. - $ cat >$TMP/metadata.tsv <<~~ + $ cat >metadata.tsv <<~~ > strain country date authors > sequence_A USA 2020-10-01 A,B,C,D,E,F,G,H,I,J,K > sequence_B USA 2020-10-02 A,B,C,D,E,F,G,H,I,J,K @@ -18,14 +17,14 @@ Create metadata TSV file for testing. Test TSV metadata input $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv + > --metadata metadata.tsv {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} Test TSV metadata input from stdin - $ cat $TMP/metadata.tsv \ + $ cat metadata.tsv \ > | ${AUGUR} curate normalize-strings \ > --metadata - {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} @@ -34,7 +33,7 @@ Test TSV metadata input from stdin Create metadata CSV file for testing. 
- $ cat >$TMP/metadata.csv <<~~ + $ cat >metadata.csv <<~~ > strain,country,date > sequence_A,USA,2020-10-01 > sequence_B,USA,2020-10-02 @@ -44,14 +43,14 @@ Create metadata CSV file for testing. Test CSV metadata input $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.csv + > --metadata metadata.csv {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"} Test CSV metadata input from stdin - $ cat $TMP/metadata.csv \ + $ cat metadata.csv \ > | ${AUGUR} curate normalize-strings \ > --metadata - {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} @@ -61,7 +60,7 @@ Test CSV metadata input from stdin Create a metadata TSV file with duplicate records - $ cat >$TMP/metadata.tsv <<~~ + $ cat >metadata.tsv <<~~ > strain country date > sequence_A USA 2020-10-01 > sequence_B USA 2020-10-02 @@ -75,7 +74,7 @@ Test default options for duplicate records, which is expected for exit with an e There will still be output due to the nature of the chained generators in augur curate. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv + > --metadata metadata.tsv ERROR: Encountered record with duplicate id 'sequence_A' in .* (re) {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} @@ -85,7 +84,7 @@ There will still be output due to the nature of the chained generators in augur Test error_all on duplicate records. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ + > --metadata metadata.tsv \ > --duplicate-reporting error_all ERROR: The following records are duplicated in .* (re) 'sequence_A' @@ -102,7 +101,7 @@ Test error_all on duplicate records. Test warning on duplicate records. 
$ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ + > --metadata metadata.tsv \ > --duplicate-reporting warn WARNING: Encountered record with duplicate id 'sequence_A' in .* (re) WARNING: Encountered record with duplicate id 'sequence_B' in .* (re) @@ -121,7 +120,7 @@ Test warning on duplicate records. Test silent on duplicate records. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ + > --metadata metadata.tsv \ > --duplicate-reporting silent {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} @@ -133,7 +132,7 @@ Test silent on duplicate records. Test duplicate records with a bogus id column, which is expected to fail with an error. $ ${AUGUR} curate passthru \ - > --metadata $TMP/metadata.tsv \ + > --metadata metadata.tsv \ > --id-column "bogus_id" ERROR: The provided id column 'bogus_id' does not exist in .* (re) [2] diff --git a/tests/functional/curate/cram/metadata-output.t b/tests/functional/curate/cram/metadata-output.t index 4c4c46285..00add745f 100644 --- a/tests/functional/curate/cram/metadata-output.t +++ b/tests/functional/curate/cram/metadata-output.t @@ -1,29 +1,28 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Testing metadata output for the curate command. Running the `passthru` subcommand since it does not do any data transformations. Create NDJSON file for testing. 
- $ cat >$TMP/records.ndjson <<~~ + $ cat >records.ndjson <<~~ > {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} > {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} > {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"} > ~~ Test metadata output TSV - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ - > --output-metadata $TMP/metadata.tsv - $ cat $TMP/metadata.tsv + > --output-metadata metadata.tsv + $ cat metadata.tsv strain\tcountry\tdate (esc) sequence_A\tUSA\t2020-10-01 (esc) sequence_B\tUSA\t2020-10-02 (esc) sequence_C\tUSA\t2020-10-03 (esc) Test metadata output TSV to stdout - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate passthru \ > --output-metadata - strain\tcountry\tdate (esc) diff --git a/tests/functional/curate/cram/normalize_strings.t b/tests/functional/curate/cram/normalize_strings.t index e9c8cb698..eb779e2c3 100644 --- a/tests/functional/curate/cram/normalize_strings.t +++ b/tests/functional/curate/cram/normalize_strings.t @@ -1,7 +1,6 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Test two versions of C-cedilla that look the same visually but have different code points, therefore are considered "Not equal". @@ -13,30 +12,30 @@ have different code points, therefore are considered "Not equal". Create NDJSON file for testing normalize-strings with different forms - $ cat >$TMP/records.ndjson <<~~ + $ cat >records.ndjson <<~~ > {"record": 1, "diacritic_1": "${DIACRITIC_1}", "diacritic_2": "${DIACRITIC_2}"} > ~~ Test output with default Unicode normalization form "NFC". - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate normalize-strings {"record": 1, "diacritic_1": "\u00c7", "diacritic_2": "\u00c7"} Test output with Unicode normalization form "NFKC". 
- $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate normalize-strings --form NFKC {"record": 1, "diacritic_1": "\u00c7", "diacritic_2": "\u00c7"} Test output with Unicode normalization form "NFD". - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate normalize-strings --form NFD {"record": 1, "diacritic_1": "C\u0327", "diacritic_2": "C\u0327"} Test output with Unicode normalization form "NFKD". - $ cat $TMP/records.ndjson \ + $ cat records.ndjson \ > | ${AUGUR} curate normalize-strings --form NFKD {"record": 1, "diacritic_1": "C\u0327", "diacritic_2": "C\u0327"} diff --git a/tests/functional/curate/cram/passthru.t b/tests/functional/curate/cram/passthru.t index c736c5c9a..65125670c 100644 --- a/tests/functional/curate/cram/passthru.t +++ b/tests/functional/curate/cram/passthru.t @@ -1,15 +1,14 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ export AUGUR="${AUGUR:-../../../../bin/augur}" + $ source "$TESTDIR"/_setup.sh Create NDJSON file for testing all valid JSON data types. - $ cat >$TMP/records.ndjson <<~~ + $ cat >records.ndjson <<~~ > {"string": "string", "number": 123, "object": {"string": "string"}, "array": ["string0", "string1", "string2"], "boolean1": true, "boolean2": false, "null": null} > ~~ Output should be exactly the same as the input. 
- $ cat $TMP/records.ndjson | ${AUGUR} curate passthru + $ cat records.ndjson | ${AUGUR} curate passthru {"string": "string", "number": 123, "object": {"string": "string"}, "array": ["string0", "string1", "string2"], "boolean1": true, "boolean2": false, "null": null} From d1488e7e00a6b6914bb4e48cbd1461f195ac5722 Mon Sep 17 00:00:00 2001 From: Jover Date: Tue, 31 Jan 2023 14:38:18 -0800 Subject: [PATCH 10/12] docs: Add usage docs for `curate format-dates` --- docs/usage/cli/curate/format-dates.rst | 9 +++++++++ docs/usage/cli/curate/index.rst | 1 + 2 files changed, 10 insertions(+) create mode 100644 docs/usage/cli/curate/format-dates.rst diff --git a/docs/usage/cli/curate/format-dates.rst b/docs/usage/cli/curate/format-dates.rst new file mode 100644 index 000000000..331e22fae --- /dev/null +++ b/docs/usage/cli/curate/format-dates.rst @@ -0,0 +1,9 @@ +============ +format-dates +============ + +.. argparse:: + :module: augur + :func: make_parser + :prog: augur + :path: curate format-dates diff --git a/docs/usage/cli/curate/index.rst b/docs/usage/cli/curate/index.rst index 95b24e62b..a907a77e1 100644 --- a/docs/usage/cli/curate/index.rst +++ b/docs/usage/cli/curate/index.rst @@ -17,5 +17,6 @@ We will continue to add more subcommands as we identify other common data curati :maxdepth: 1 normalize-strings + format-dates passthru From ab1d9f92d80339d27366af26edb209d03b8ad9da Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:53:19 -0700 Subject: [PATCH 11/12] curate/format-dates: Clarify help docs --- augur/curate/format_dates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 003fcdf30..c7ccef72b 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -24,7 +24,7 @@ def register_parser(parent_subparsers): help="Expected date formats that are currently in the provided date fields, " + "defined by 
standard format codes as listed at " + "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. " + - "If a date string matches multiple formats, it will be parsed as the first format in the list.") + "If a date string matches multiple formats, it will be parsed as the first matched format in the provided order.") optional = parser.add_argument_group(title="OPTIONAL") optional.add_argument("--failure-reporting", From b4d390d0cf7249f50a07a98ef2d1affc249aa711 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 6 Jul 2023 17:15:18 -0700 Subject: [PATCH 12/12] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index e91dd5d32..ce6259a5a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,12 +5,14 @@ ### Features * export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin) +* Add new sub-subcommand `augur curate format-dates`. The `format-dates` command formats date fields to ISO 8601 date format (YYYY-MM-DD), where incomplete dates are masked with `XX` (e.g. 2023 -> 2023-XX-XX). [#1146][] (@joverlee521) ### Bug fixes * parse: Fix a bug where `--fix-dates` was always applied, with a default of `--fix-dates=monthfirst`. Now, running without `--fix-dates` will leave dates as-is. [#1247][] (@victorlin) * `augur.io.open_file`: Previously, the docs described a type restriction on `path_or_buffer` but it was not enforced. It has been updated to allow all I/O classes, and is enforced at run-time. [#1250][] (@victorlin) +[#1146]: https://github.com/nextstrain/augur/pull/1146 [#1240]: https://github.com/nextstrain/augur/pull/1240 [#1247]: https://github.com/nextstrain/augur/issues/1247 [#1250]: https://github.com/nextstrain/augur/pull/1250