Adds augur curate titlecase sub-command

Adds a new sub-command `augur curate titlecase` based on the transform-string-fields script in the monkeypox repo. The `augur curate normalize` sub-command has already been added based on the same script (#1039). Overall this is part of filling in the gaps in the augur curate suite of commands (#860), specifically addressing issue (#999), and is a follow-up to #1039. `augur curate titlecase` would transform the values of a given metadata field to titlecase. This is useful for normalizing the values of a string that may contain inconsistent capitalization such as "North America" and "north america". Co-authored-by: Jover Lee <joverlee521@gmail.com>
nextstrain · Apr 19, 2023 · df24d0c · df24d0c
1 parent e6ff9ef
commit df24d0c
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 1 deletion.
diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -12,13 +12,14 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import normalize_strings, passthru
+from . import normalize_strings, passthru, titlecase
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
 SUBCOMMANDS = [
     passthru,
     normalize_strings,
+    titlecase,
 ]
 
 

diff --git a/augur/curate/titlecase.py b/augur/curate/titlecase.py
@@ -0,0 +1,124 @@
+"""
+Applies titlecase to string fields in a metadata record
+"""
+import argparse
+
+import re
+from typing import Optional, Set, Union
+
+from augur.errors import AugurError
+from augur.io.print import print_err
+from augur.types import DataErrorMethod
+
+def register_parser(parent_subparsers):
+    """
+    Register the ``titlecase`` sub-command on *parent_subparsers*.
+
+    The sub-command parser is created with ``parent_subparsers.shared_parser``
+    as its parent and this module's docstring as its help text. Its options
+    are split into a "REQUIRED" and an "OPTIONAL" argument group. The newly
+    created parser is returned.
+    """
+    titlecase_parser = parent_subparsers.add_parser(
+        "titlecase",
+        parents=[parent_subparsers.shared_parser],
+        help=__doc__,
+    )
+
+    # Options the user must always provide.
+    required_group = titlecase_parser.add_argument_group(title="REQUIRED")
+    required_group.add_argument(
+        "--titlecase-fields",
+        nargs="*",
+        required=True,
+        help="List of fields to convert to titlecase.",
+    )
+
+    # Options that tune how individual words are cased and how failures
+    # are reported.
+    optional_group = titlecase_parser.add_argument_group(title="OPTIONAL")
+    optional_group.add_argument(
+        "--articles",
+        nargs="*",
+        help="List of articles that should not be converted to titlecase.",
+    )
+    optional_group.add_argument(
+        "--abbreviations",
+        nargs="*",
+        help="List of abbreviations that should not be converted to titlecase, keeps uppercase.",
+    )
+    optional_group.add_argument(
+        "--failure-reporting",
+        type=DataErrorMethod,
+        choices=list(DataErrorMethod),
+        default=DataErrorMethod.ERROR_FIRST,
+        help="How should failed titlecase formatting be reported.",
+    )
+
+    return titlecase_parser
+
+
+def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
+    """
+    Originally from nextstrain/ncov-ingest
+
+    Returns a title cased version of *text*, or None if *text* is not a
+    string.
+
+    *text* is split on word boundaries and every piece is cased on its own:
+
+    * a piece whose uppercased form is in *abbreviations* is returned in
+      uppercase;
+    * a piece whose casefolded form is in *articles* is returned in
+      lowercase, unless it is the first word of *text*;
+    * every other piece is title cased.
+
+    Because of these comparisons, *articles* only match when given in
+    lowercase and *abbreviations* only match when given in uppercase.
+    Omitting either argument (or passing None) is the same as passing an
+    empty set.
+
+    >>> articles = {'a', 'and', 'of', 'the', 'le'}
+    >>> abbreviations = {'USA', 'DC'}
+
+    >>> titlecase("the night OF THE LIVING DEAD", articles)
+    'The Night of the Living Dead'
+
+    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
+    'Braine-le-Comte, France'
+
+    >>> titlecase("auvergne-RHÔNE-alpes", articles)
+    'Auvergne-Rhône-Alpes'
+
+    >>> titlecase("washington DC, usa", articles, abbreviations)
+    'Washington DC, USA'
+
+    >>> titlecase(None) is None
+    True
+    """
+    if not isinstance(text, str):
+        return None
+
+    # Use None as the default instead of a shared mutable literal.
+    if articles is None:
+        articles = set()
+    if abbreviations is None:
+        abbreviations = set()
+
+    def changecase(index, word):
+        upper = word.upper()
+        if upper in abbreviations:
+            return upper
+
+        # Splitting on r'\b' always puts whatever precedes the first word
+        # boundary (possibly an empty string) at index 0, so index 1 is the
+        # first word of the text. A leading article is still capitalized.
+        if index != 1 and word.casefold() in articles:
+            return word.lower()
+
+        return word.title()
+
+    # The separators between words are kept as pieces of their own, so
+    # joining the re-cased pieces rebuilds the text with its original
+    # punctuation and spacing.
+    pieces = re.split(r'\b', text)
+    return ''.join(changecase(i, w) for i, w in enumerate(pieces))
+
+
+def run(args, records):
+    """
+    Yield a copy of each record in *records* with the fields listed in
+    ``args.titlecase_fields`` converted to titlecase.
+
+    ``args.articles`` and ``args.abbreviations`` are forwarded to
+    `titlecase` as sets. A field that is missing from a record is treated as
+    an empty string, so it is written back to the record as "".
+
+    A field fails when `titlecase` returns None for its value. What happens
+    then depends on ``args.failure_reporting``:
+
+    * ``ERROR_FIRST``: raise AugurError on the first failure.
+    * ``WARN``: print a warning per failure and a summary at the end.
+    * ``ERROR_ALL``: collect all failures and raise AugurError with a summary
+      once every record has been yielded.
+    * ``SILENT``: ignore failures.
+
+    Failed fields keep their original value in the yielded record.
+    """
+    failures = []
+    failure_reporting = args.failure_reporting
+
+    # Either option may be None or empty when not given on the command line.
+    articles = set(args.articles or ())
+    abbreviations = set(args.abbreviations or ())
+
+    for index, record in enumerate(records):
+        # Work on a copy so the caller's record is left untouched.
+        record = record.copy()
+        record_id = index
+
+        for field in args.titlecase_fields:
+            original_value = record.get(field, "")
+            titlecased_string = titlecase(original_value, articles, abbreviations)
+
+            if titlecased_string is not None:
+                record[field] = titlecased_string
+                continue
+
+            failure_message = f"Failed to titlecase {field} in record {record_id}"
+            if failure_reporting is DataErrorMethod.ERROR_FIRST:
+                raise AugurError(failure_message)
+
+            if failure_reporting is DataErrorMethod.WARN:
+                print_err(f"WARNING: {failure_message}")
+
+            # Keep track of failures for final summary. This must happen for
+            # every reporting method that did not raise above; otherwise
+            # ERROR_ALL would have nothing to report and never raise.
+            failures.append((record_id, field, original_value))
+
+        yield record
+
+    # Runs only once the generator has been fully consumed.
+    if failure_reporting is not DataErrorMethod.SILENT and failures:
+        failure_message = (
+            "Unable to change to titlecase for the following (record, field, field value):\n" + \
+            '\n'.join(map(repr, failures))
+        )
+        if failure_reporting is DataErrorMethod.ERROR_ALL:
+            raise AugurError(failure_message)
+
+        elif failure_reporting is DataErrorMethod.WARN:
+            print_err(f"WARNING: {failure_message}")
+
+        else:
+            raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}")
diff --git a/tests/functional/curate/cram/titlecase.t b/tests/functional/curate/cram/titlecase.t
@@ -0,0 +1,18 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+
+Create NDJSON file for testing titlecase with different forms
+
+  $ cat >$TMP/records.ndjson <<~~
+  > {"record": 1, "authors": "john smith", "author2": "Jane Doe"}
+  > ~~
+
+
+Test that titlecase is applied to the "authors" and "author2" fields.
+
+  $ cat $TMP/records.ndjson \
+  >   | ${AUGUR} curate titlecase --titlecase-fields "authors" "author2"
+  {"record": 1, "authors": "John Smith", "author2": "Jane Doe"}