[titlecase] Adds augur curate titlecase sub-command

Adds a new sub-command, `augur curate titlecase`, based on the transform-string-fields script in the monkeypox repo. The `augur curate normalize-strings` sub-command has already been added based on the same script (#1039). Overall, this is part of filling in the gaps in the augur curate suite of commands (#860); it specifically addresses issue #999 and is a follow-up to #1039. `augur curate titlecase` transforms the values of the given metadata fields to titlecase. This is useful for normalizing string values that may contain inconsistent capitalization, such as "North America" and "north america". This commit also adds a test for the new sub-command and updates the documentation. For testing the conversion of an upper-case circumflexed o to lower case, the test had to use the escaped unicode character. Co-authored-by: Jover Lee <joverlee521@gmail.com>
nextstrain · Jul 14, 2023 · bcea80c · bcea80c
1 parent 9ef4711
commit bcea80c
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 1 deletion.
diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -12,14 +12,15 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import format_dates, normalize_strings, passthru
+from . import format_dates, normalize_strings, passthru, titlecase
 
 
+# NOTE(review): presumably the attribute name under which the selected curate
+# subcommand is stored; its use is outside this hunk, so confirm in the rest
+# of this module.
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
+# Modules in this package that are listed as `augur curate` subcommands.
+# NOTE(review): how this list is consumed is not visible in this hunk.
 SUBCOMMANDS = [
     passthru,
     normalize_strings,
     format_dates,
+    titlecase,
 ]
 
 

diff --git a/augur/curate/titlecase.py b/augur/curate/titlecase.py
@@ -0,0 +1,122 @@
+"""
+Applies titlecase to string fields in a metadata record
+"""
+import re
+from typing import Optional, Set, Union
+
+from augur.errors import AugurError
+from augur.io.print import print_err
+from augur.types import DataErrorMethod
+
+def register_parser(parent_subparsers):
+    """
+    Create the ``titlecase`` subcommand parser under *parent_subparsers*.
+
+    The parser is built with ``parent_subparsers.shared_parser`` as its parent
+    and uses this module's docstring as its help text. The configured parser
+    is returned.
+    """
+    titlecase_parser = parent_subparsers.add_parser(
+        "titlecase",
+        parents=[parent_subparsers.shared_parser],
+        help=__doc__,
+    )
+
+    required_group = titlecase_parser.add_argument_group(title="REQUIRED")
+    required_group.add_argument(
+        "--titlecase-fields",
+        nargs="*",
+        required=True,
+        help="List of fields to convert to titlecase.",
+    )
+
+    optional_group = titlecase_parser.add_argument_group(title="OPTIONAL")
+    optional_group.add_argument(
+        "--articles",
+        nargs="*",
+        help="List of articles that should not be converted to titlecase.",
+    )
+    optional_group.add_argument(
+        "--abbreviations",
+        nargs="*",
+        help="List of abbreviations that should not be converted to titlecase, keeps uppercase.",
+    )
+    optional_group.add_argument(
+        "--failure-reporting",
+        type=DataErrorMethod.argtype,
+        choices=list(DataErrorMethod),
+        default=DataErrorMethod.ERROR_FIRST,
+        help="How should failed titlecase formatting be reported.",
+    )
+
+    return titlecase_parser
+
+
+def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
+    """
+    Originally from nextstrain/ncov-ingest
+
+    Returns a title cased copy of *text*, or None if *text* is not a string.
+
+    *text* is split at word boundaries and each piece is converted on its own:
+
+    * a piece whose upper-cased form is in *abbreviations* is returned in
+      upper case, so *abbreviations* must be given in upper case;
+    * a piece whose casefolded form is in *articles* is returned in lower
+      case, unless it is the first word of *text*, so *articles* must be
+      given in lower case;
+    * every other piece is converted with ``str.title()``.
+
+    Both sets default to empty when omitted or None.
+
+    Known limitation: an apostrophe is a word boundary, so the letters on
+    both sides of it are capitalized separately (e.g. "don't" -> "Don'T").
+
+    >>> articles = {'a', 'and', 'of', 'the', 'le'}
+    >>> abbreviations = {'USA', 'DC'}
+
+    >>> titlecase("the night OF THE LIVING DEAD", articles)
+    'The Night of the Living Dead'
+
+    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
+    'Braine-le-Comte, France'
+
+    >>> titlecase("auvergne-RHÔNE-alpes", articles)
+    'Auvergne-Rhône-Alpes'
+
+    >>> titlecase("washington DC, usa", articles, abbreviations)
+    'Washington DC, USA'
+
+    >>> titlecase("north america")
+    'North America'
+
+    >>> titlecase(None) is None
+    True
+    """
+    if not isinstance(text, str):
+        return None
+
+    # None defaults replace the previous `{}` defaults, which were mutable
+    # objects shared across calls and were dicts rather than sets.
+    if articles is None:
+        articles = set()
+    if abbreviations is None:
+        abbreviations = set()
+
+    def changecase(index, word):
+        casefold = word.casefold()
+        upper = word.upper()
+
+        if upper in abbreviations:
+            return upper
+        # Splitting on r'\b' always yields a (possibly empty) non-word piece
+        # first, so index 1 is the first word of the text; it is title cased
+        # even when it is an article.
+        elif casefold in articles and index != 1:
+            return word.lower()
+        else:
+            return word.title()
+
+    # The pieces alternate between non-word and word runs; joining them back
+    # together preserves the original spacing and punctuation.
+    return ''.join(changecase(i, w) for i, w in enumerate(re.split(r'\b', text)))
+
+
+def run(args, records):
+    """
+    Titlecase the requested fields of each record and yield the result.
+
+    For every record in *records*, a copy is made and each field named in
+    ``args.titlecase_fields`` that is present in the record is passed through
+    ``titlecase`` together with ``args.articles`` and ``args.abbreviations``.
+    Fields that are absent from a record are skipped, so no new fields are
+    added to the output. The input records themselves are not modified.
+
+    A value that ``titlecase`` cannot convert (it returns None) is left
+    unchanged and handled according to ``args.failure_reporting``:
+
+    * ``DataErrorMethod.ERROR_FIRST``: raise AugurError on the first failure.
+    * ``DataErrorMethod.WARN``: print a warning per failure and a summary
+      warning after all records have been yielded.
+    * ``DataErrorMethod.ERROR_ALL``: raise AugurError listing every failure
+      after all records have been yielded.
+    * ``DataErrorMethod.SILENT``: ignore failures.
+
+    Raises ValueError if failures were collected under any other reporting
+    method.
+    """
+    failures = []
+    failure_reporting = args.failure_reporting
+
+    articles = set(args.articles) if args.articles else set()
+    abbreviations = set(args.abbreviations) if args.abbreviations else set()
+
+    for index, input_record in enumerate(records):
+        record = input_record.copy()
+        record_id = index
+
+        for field in args.titlecase_fields:
+            # Skip fields the record does not have; defaulting them to ""
+            # would add an empty field to the output record.
+            if field not in record:
+                continue
+
+            original_value = record[field]
+            titlecased_string = titlecase(original_value, articles, abbreviations)
+
+            if titlecased_string is not None:
+                record[field] = titlecased_string
+                continue
+
+            failure_message = f"Failed to titlecase {field!r}:{original_value!r} in record {record_id!r}"
+            if failure_reporting is DataErrorMethod.ERROR_FIRST:
+                raise AugurError(failure_message)
+
+            if failure_reporting is DataErrorMethod.WARN:
+                print_err(f"WARNING: {failure_message}")
+
+            # Keep track of failures for final summary. This must happen for
+            # every reporting method, otherwise ERROR_ALL has nothing to
+            # report and never raises.
+            failures.append((record_id, field, original_value))
+
+        yield record
+
+    if failure_reporting is not DataErrorMethod.SILENT and failures:
+        failure_message = (
+            "Unable to change to titlecase for the following (record, field, field value):\n" + \
+            '\n'.join(map(repr, failures))
+        )
+        if failure_reporting is DataErrorMethod.ERROR_ALL:
+            raise AugurError(failure_message)
+
+        elif failure_reporting is DataErrorMethod.WARN:
+            print_err(f"WARNING: {failure_message}")
+
+        else:
+            raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}")
diff --git a/docs/usage/cli/curate/index.rst b/docs/usage/cli/curate/index.rst
@@ -17,6 +17,7 @@ We will continue to add more subcommands as we identify other common data curati
     :maxdepth: 1
 
     normalize-strings
+    titlecase
     format-dates
     passthru
 
diff --git a/docs/usage/cli/curate/titlecase.rst b/docs/usage/cli/curate/titlecase.rst
@@ -0,0 +1,9 @@
+=================
+titlecase
+=================
+
+.. argparse::
+    :module: augur
+    :func: make_parser
+    :prog: augur
+    :path: curate titlecase
diff --git a/tests/functional/curate/cram/titlecase.t b/tests/functional/curate/cram/titlecase.t
@@ -0,0 +1,29 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="${AUGUR:-../../../../bin/augur}"
+
+
+Test output with articles and a mixture of lower and uppercase letters.
+
+  $ echo '{"title": "the night OF THE LIVING DEAD"}' \
+  >   | ${AUGUR} curate titlecase --titlecase-fields "title" --articles "a" "and" "of" "the" "le"
+  {"title": "The Night of the Living Dead"}
+
+Test output with hyphenated location.
+
+  $ echo '{"location": "BRAINE-LE-COMTE, FRANCE"}' \
+  >   | ${AUGUR} curate titlecase --titlecase-fields "location" --articles "a" "and" "of" "the" "le"
+  {"location": "Braine-le-Comte, France"}
+
+Test output with unicode characters.
+The input uses mixed case so that the uppercase circumflexed O has to be
+lowercased; it appears as an escaped unicode character in the output.
+
+  $ echo '{"location": "auvergne-RHÔNE-alpes" }' \
+  >   | ${AUGUR} curate titlecase --titlecase-fields "location"
+  {"location": "Auvergne-Rh\u00f4ne-Alpes"}
+
+Test output with abbreviations
+
+  $ echo '{"city": "Washington DC, USA"}' \
+  >   | ${AUGUR} curate titlecase --titlecase-fields "city" --abbreviations "USA" "DC"
+  {"city": "Washington DC, USA"}