ixc · ahankinson · Apr 13, 2021 · Jul 26, 2021 · Apr 26, 2024 · Apr 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -64,3 +64,5 @@ docs/_build/
 
 # PyBuilder
 target/
+.idea
+.DS_Store
diff --git a/edtf/convert.py b/edtf/convert.py
@@ -70,8 +70,7 @@ def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time:
     """
     if strip_time:
         return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS)
-    else:
-        return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)
+    return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)
 
 
 def struct_time_to_jd(st: struct_time) -> float:
@@ -116,7 +115,7 @@ def jd_to_struct_time(jd: float) -> struct_time:
     return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS)
 
 
-def _roll_negative_time_fields(year, month, day, hour, minute, second):
+def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple:
     """
     Fix date/time fields which have nonsense negative values for any field
     except for year by rolling the overall date/time value backwards, treating
@@ -152,4 +151,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second):
         year += int(month / 12.0)  # Adjust by whole year in months
         year -= 1  # Subtract 1 for negative minutes
         month %= 12  # Convert negative month to positive remainder
-    return (year, month, day, hour, minute, second)
+
+    return year, month, day, hour, minute, second
diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py
@@ -1,7 +1,9 @@
 """Utilities to derive an EDTF string from an (English) natural language string."""
 
+import functools
 import re
 from datetime import datetime
+from typing import Optional
 
 from dateutil.parser import ParserError, parse
 
@@ -13,19 +15,46 @@
 DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
 DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)
 
-SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
-LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
-CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
-CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"
+SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])")
+LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)")
+CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?")
+CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]")
+CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)")
+ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b")
+TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b")
+PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$")
+SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)")
+BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b")
+AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b")
+APPROX_CHECK = re.compile(
+    r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)"
+)
+UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)")
+UNCERTAIN_REPL = re.compile(r"(\d{4})\?")
+MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s")
+MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s")
+
+APPROX_CENTURY_RE = re.compile(
+    r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
+)
+UNCERTAIN_CENTURY_RE = re.compile(
+    r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?"
+)
+
+APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)")
+UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?")
+
+MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b")
+MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b")
+MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b")
 
 # Set of RE rules that will cause us to abort text processing, since we know
 # the results will be wrong.
-REJECT_RULES = (
-    r".*dynasty.*",  # Don't parse '23rd Dynasty' to 'uuuu-uu-23'
-)
+REJECT_RULES = re.compile(r".*dynasty.*")  # Don't parse '23rd Dynasty' to 'uuuu-uu-23'
 
 
-def text_to_edtf(text):
+@functools.lru_cache
+def text_to_edtf(text: str) -> Optional[str]:
     """
     Generate EDTF string equivalent of a given natural language date string.
     """
@@ -35,35 +64,37 @@ def text_to_edtf(text):
     t = text.lower()
 
     # try parsing the whole thing
-    result = text_to_edtf_date(t)
+    result: Optional[str] = text_to_edtf_date(t)
 
     if not result:
         # split by list delims and move fwd with the first thing that returns a non-empty string.
         # TODO: assemble multiple dates into a {} or [] structure.
         for split in [",", ";", "or"]:
             for list_item in t.split(split):
                 # try parsing as an interval - split by '-'
-                toks = list_item.split("-")
+                toks: list[str] = list_item.split("-")
+
                 if len(toks) == 2:
                     d1 = toks[0].strip()
                     d2 = toks[1].strip()
 
                     # match looks from the beginning of the string, search
                     # looks anywhere.
 
-                    if re.match(r"\d\D\b", d2):  # 1-digit year partial e.g. 1868-9
+                    if re.match(
+                        ONE_DIGIT_PARTIAL_FIRST, d2
+                    ):  # 1-digit year partial e.g. 1868-9
                         if re.search(
-                            r"\b\d\d\d\d$", d1
+                            PARTIAL_CHECK, d1
                         ):  # TODO: evaluate it and see if it's a year
                             d2 = d1[-4:-1] + d2
-                    elif re.match(r"\d\d\b", d2):  # 2-digit year partial e.g. 1809-10
-                        if re.search(r"\b\d\d\d\d$", d1):
+                    elif re.match(
+                        TWO_DIGIT_PARTIAL_FIRST, d2
+                    ):  # 2-digit year partial e.g. 1809-10
+                        if re.search(PARTIAL_CHECK, d1):
                             d2 = d1[-4:-2] + d2
                     else:
-                        century_range_match = re.search(
-                            r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]",
-                            f"{d1}-{d2}",
-                        )
+                        century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}")
                         if century_range_match:
                             g = century_range_match.groups()
                             d1 = f"{g[0]}C"
@@ -73,7 +104,7 @@ def text_to_edtf(text):
                     r2 = text_to_edtf_date(d2)
 
                     if r1 and r2:
-                        result = r1 + "/" + r2
+                        result = f"{r1}/{r2}"
                         return result
 
                 # is it an either/or year "1838/1862" - that has a different
@@ -82,7 +113,7 @@ def text_to_edtf(text):
                 # This whole section could be more friendly.
 
                 else:
-                    int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item)
+                    int_match = re.search(SLASH_YEAR, list_item)
                     if int_match:
                         return f"[{int_match.group(1)}, {int_match.group(2)}]"
 
@@ -92,21 +123,19 @@ def text_to_edtf(text):
             if result:
                 break
 
-    is_before = re.findall(r"\bbefore\b", t)
-    is_before = is_before or re.findall(r"\bearlier\b", t)
-
-    is_after = re.findall(r"\bafter\b", t)
-    is_after = is_after or re.findall(r"\bsince\b", t)
-    is_after = is_after or re.findall(r"\blater\b", t)
+    is_before = re.findall(BEFORE_CHECK, t)
+    is_after = re.findall(AFTER_CHECK, t)
 
     if is_before:
-        result = f"/{result}"  # unknown is replaced with null for intervals
+        result = f"/{result}"
     elif is_after:
-        result = f"{result}/"  # unknown is replaced with null for intervals
+        result = f"{result}/"
+
     return result
 
 
-def text_to_edtf_date(text):
+@functools.lru_cache
+def text_to_edtf_date(text: str) -> Optional[str]:
     """
     Return EDTF string equivalent of a given natural language date string.
 
@@ -115,37 +144,28 @@ def text_to_edtf_date(text):
     differ are undefined.
     """
     if not text:
-        return
+        return None
 
     t = text.lower()
-    result = ""
+    result: str = ""
 
-    for reject_re in REJECT_RULES:
-        if re.match(reject_re, t):
-            return
+    if re.match(REJECT_RULES, t):
+        return None
 
     # matches on '1800s'. Needs to happen before is_decade.
-    could_be_century = re.findall(r"(\d{2}00)s", t)
+    could_be_century: list = re.findall(MIGHT_BE_CENTURY, t)
     # matches on '1800s' and '1910s'. Removes the 's'.
     # Needs to happen before is_uncertain because e.g. "1860s?"
-    t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t)
+    t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t)
 
     # detect approximation signifiers
     # a few 'circa' abbreviations just before the year
-    is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t)
+    is_approximate = re.findall(APPROX_CHECK, t)
     # the word 'circa' anywhere
-    is_approximate = is_approximate or re.findall(r"\bcirca\b", t)
-    # the word 'approx'/'around'/'about' anywhere
-    is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t)
-    # a ~ before a year-ish number
-    is_approximate = is_approximate or re.findall(r"\b~\d{4}", t)
-    # a ~ at the beginning
-    is_approximate = is_approximate or re.findall(r"^~", t)
 
     # detect uncertainty signifiers
-    t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t)
-    # the words uncertain/maybe/guess anywhere
-    is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t)
+    t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t)
+    is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t)
 
     # detect century forms
     is_century = re.findall(CENTURY_RE, t)
@@ -154,31 +174,28 @@ def text_to_edtf_date(text):
     is_ce = re.findall(CE_RE, t)
     if is_century:
         result = "%02dXX" % (int(is_century[0][0]) - 1,)
-        is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
-        is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)
+        is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t)
+        is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t)
 
         try:
-            is_bc = is_century[0][-1] in ("bc", "bce")
-            if is_bc:
+            if is_century[0][-1] in ("bc", "bce"):
                 result = f"-{result}"
         except IndexError:
             pass
 
     elif is_ce:
         result = "%04d" % (int(is_ce[0][0]))
-        is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t)
-        is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t)
+        is_approximate = is_approximate or re.findall(APPROX_CE_RE, t)
+        is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t)
 
         try:
-            is_bc = is_ce[0][-1] in ("bc", "bce")
-            if is_bc:
+            if is_ce[0][-1] in ("bc", "bce"):
                 result = f"-{result}"
         except IndexError:
             pass
 
     else:
         # try dateutil.parse
-
         try:
             # parse twice, using different defaults to see what was
             # parsed and what was guessed.
@@ -205,34 +222,34 @@ def text_to_edtf_date(text):
 
         if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date():
             # couldn't parse anything - defaults are untouched.
-            return
+            return None
 
         date1 = dt1.isoformat()[:10]
         date2 = dt2.isoformat()[:10]
 
         # guess precision of 'unspecified' characters to use
-        mentions_year = re.findall(r"\byear\b.+(in|during)\b", t)
-        mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t)
-        mentions_day = re.findall(r"\bday\b.+(in|during)\b", t)
+        mentions_year = re.findall(MENTIONS_YEAR, t)
+        mentions_month = re.findall(MENTIONS_MONTH, t)
+        mentions_day = re.findall(MENTIONS_DAY, t)
 
         for i in range(len(date1)):
             # if the given year could be a century (e.g. '1800s') then use
             # approximate/uncertain markers to decide whether we treat it as
             # a century or a decade.
             if i == 2 and could_be_century and not (is_approximate or is_uncertain):
                 result += "X"
-            elif i == 3 and is_decade > 0:
+            elif i == 3 and is_decade:
                 if mentions_year:
-                    result += "X"  # previously year precision - now just X
+                    result += "X"  # year precision
                 else:
-                    result += "X"  # previously decade precision - now just X
+                    result += "X"  # decade precision
             elif date1[i] == date2[i]:
                 # since both attempts at parsing produced the same result
                 # it must be parsed value, not a default
                 result += date1[i]
             else:
                 # different values were produced, meaning that it's likely
-                # a default. Use 'X'
+                # a default. Use 'unspecified'
                 result += "X"
 
         # strip off unknown chars from end of string - except the first 4