From 86b94517f1df7d705564d5a93e0d0c431ccd9120 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Apr 2021 18:52:28 +0200 Subject: [PATCH 01/21] Enable packratting for pyparser Delivers significant performance improvements by caching previously computed results. --- edtf/parser/grammar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index c028c6e..d612c5f 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,5 +1,9 @@ from pyparsing import Literal as L, ParseException, Optional, OneOrMore, \ - ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums + ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums, ParserElement + +# From the pyparsing performance improvement tips: +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips +ParserElement.enablePackrat() # (* ************************** Level 0 *************************** *) from edtf.parser.parser_classes import Date, DateAndTime, Interval, Unspecified, \ From 7fdf8dd8b649a5085d8f2aed3b66a8734f2ce915 Mon Sep 17 00:00:00 2001 From: jacobcolyvan Date: Mon, 26 Jul 2021 12:29:25 +1000 Subject: [PATCH 02/21] #37 update for Django 3.x compat --- edtf/fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edtf/fields.py b/edtf/fields.py index 83d10a7..52b9171 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -53,7 +53,7 @@ def deconstruct(self): del kwargs["max_length"] return name, path, args, kwargs - def from_db_value(self, value, expression, connection, context): + def from_db_value(self, value, expression, connection, context=None): # Converting values to Python objects if not value: return None From 6e4a627df5447b76db492b1603f95bbd55524346 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:43:38 +0200 Subject: [PATCH 03/21] Minor updates --- edtf/natlang/en.py | 3 ++- poetry.lock | 45 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 18 ++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ec7842b..5263e07 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -89,6 +89,7 @@ def text_to_edtf(text): is_before = re.findall(r'\bbefore\b', t) is_before = is_before or re.findall(r'\bearlier\b', t) + is_before = is_before or re.findall(r'\baprés\b', t) is_after = re.findall(r'\bafter\b', t) is_after = is_after or re.findall(r'\bsince\b', t) @@ -133,7 +134,7 @@ def text_to_edtf_date(text): is_approximate = is_approximate or re.findall(r'\bcirca\b', t) # the word 'approx'/'around'/'about' anywhere is_approximate = is_approximate or \ - re.findall(r'\b(approx|around|about)', t) + re.findall(r'\b(approx|approximately|around|about)', t) # a ~ before a year-ish number is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) # a ~ at the beginning diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..745843e --- /dev/null +++ b/poetry.lock @@ -0,0 +1,45 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + +[[package]] +name = "pyparsing" +version = "3.1.2" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f203360 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "python-edtf" +version = "0.1.0" +description = "" +authors = ["Andrew Hankinson "] +readme = "README.md" +packages = [{include = "python_edtf"}] + +[tool.poetry.dependencies] +python = "^3.11" +python-dateutil = "^2.9.0.post0" +pyparsing = "^3.1.2" +six = "^1.16.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" From 80fdd60cbb590d7139341293185628d6aa8cac5b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:49:58 +0200 Subject: [PATCH 04/21] Update dependency management --- pyproject.toml | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f203360..f1d7c5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "python-edtf" +name = "edtf" version = "0.1.0" description = "" authors = ["Andrew Hankinson "] diff --git a/setup.py b/setup.py index f0f1849..f2cc7d5 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from __future__ import print_function import setuptools -import sys def readme(): with open('README.md') as f: From c12d759732d393ac66faa462b8d61b057c675d17 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:55:52 +0200 Subject: [PATCH 05/21] Deps --- poetry.lock | 4 ++-- pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 745843e..c4b40b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -41,5 +41,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" +python-versions = "^3.9" +content-hash = "e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200" diff --git a/pyproject.toml b/pyproject.toml index f1d7c5f..9af9ee4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "edtf" -version = "0.1.0" +version = "4.0.1+enh" description = "" authors = ["Andrew Hankinson "] readme = "README.md" -packages = [{include = "python_edtf"}] +packages = [{include = "edtf"}] [tool.poetry.dependencies] -python = "^3.11" +python = "^3.9" python-dateutil = "^2.9.0.post0" pyparsing = "^3.1.2" six = "^1.16.0" From 6e508d016e9bbcc49b90d3c88ca3512d69a0d193 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:03:14 +0200 Subject: [PATCH 06/21] Optimized regexes --- edtf/natlang/en.py | 126 ++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 5263e07..4f68f21 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,9 +1,10 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" from datetime import datetime +from typing import Optional + from dateutil.parser import parse import re from edtf import appsettings -from six.moves import xrange # two dates where every digit of an ISO date representation is different, @@ -12,24 +13,43 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r'(-?)([\du])([\dxu])([\dxu])([\dxu])' -LONG_YEAR_RE = r'y(-?)([1-9]\d\d\d\d+)' -CENTURY_RE = r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?' -CE_RE = r'(\d{1,4}) (ad|ce|bc|bce)' +SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') +LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') +CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') +CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') +ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') +TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') +PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") +APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r'(\d{4})\?') +MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') +MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') + +APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') + +APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') +UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. REJECT_RULES = ( - r'.*dynasty.*', # Don't parse '23rd Dynasty' to 'uuuu-uu-23' + re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' ) -def text_to_edtf(text): +def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. """ if not text: - return + return None t = text.lower() @@ -51,18 +71,18 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r'\d\D\b', d2): # 1-digit year partial e.g. 1868-9 - if re.search(r'\b\d\d\d\d$', d1): # TODO: evaluate it and see if it's a year + if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 + if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r'\d\d\b', d2): # 2-digit year partial e.g. 1809-10 - if re.search(r'\b\d\d\d\d$', d1): + elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]', "%s-%s" % (d1,d2)) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() - d1 = "%sC" % g[0] - d2 = "%sC" % g[2] + d1 = f"{g[0]}C" + d2 = f"{g[2]}C" r1 = text_to_edtf_date(d1) r2 = text_to_edtf_date(d2) @@ -77,9 +97,9 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: - return "[%s, %s]" % (int_match.group(1), int_match.group(2)) + return f"[{int_match.group(1)}, {int_match.group(2)}]" result = text_to_edtf_date(list_item) if result: @@ -87,23 +107,18 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r'\bbefore\b', t) - is_before = is_before or re.findall(r'\bearlier\b', t) - is_before = is_before or re.findall(r'\baprés\b', t) - - is_after = re.findall(r'\bafter\b', t) - is_after = is_after or re.findall(r'\bsince\b', t) - is_after = is_after or re.findall(r'\blater\b', t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = u"unknown/%s" % result + result = f"unknown/{result}" elif is_after: - result = u"%s/unknown" % result + result = f"{result}/unknown" return result -def text_to_edtf_date(text): +def text_to_edtf_date(text) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -112,39 +127,29 @@ def text_to_edtf_date(text): differ are undefined. """ if not text: - return + return None t = text.lower() result = '' for reject_re in REJECT_RULES: if re.match(reject_re, t): - return + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r'(\d{2}00)s', t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r'(\d{3}0)s', r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r'\b(ca?\.?) ?\d{4}', t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r'\bcirca\b', t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or \ - re.findall(r'\b(approx|approximately|around|about)', t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r'^~', t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r'(\d{4})\?', r'\1', t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall( - r'\b(uncertain|possibly|maybe|guess)', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -153,27 +158,23 @@ def text_to_edtf_date(text): is_ce = re.findall(CE_RE, t) if is_century: result = "%02dxx" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_century[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass elif is_ce: result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_ce[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass @@ -200,12 +201,12 @@ def text_to_edtf_date(text): ) except ValueError: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and \ dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] @@ -215,14 +216,13 @@ def text_to_edtf_date(text): mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) - for i in xrange(len(date1)): + for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. - if i == 2 and could_be_century and \ - not (is_approximate or is_uncertain): + if i == 2 and could_be_century and not (is_approximate or is_uncertain): result += 'x' - elif i == 3 and is_decade > 0: + elif i == 3 and is_decade: if mentions_year: result += 'u' # year precision else: @@ -238,7 +238,7 @@ def text_to_edtf_date(text): # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): + for i in reversed(range(len(result))): if result[i] not in ('u', 'x', '-'): smallest_length = 4 From f2252f03c23b1f7a6a153ccf750e97a94ce71dd2 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:18:26 +0200 Subject: [PATCH 07/21] Package updates --- edtf/convert.py | 8 +- edtf/jdutil.py | 32 +++---- edtf/natlang/en.py | 11 ++- edtf/natlang/tests.py | 4 +- edtf/parser/grammar.py | 14 +-- edtf/parser/parser_classes.py | 159 +++++++++++++++++----------------- edtf/parser/tests.py | 66 +++++++------- 7 files changed, 152 insertions(+), 142 deletions(-) diff --git a/edtf/convert.py b/edtf/convert.py index c1bfd3a..de1f2a2 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -59,8 +59,7 @@ def trim_struct_time(st, strip_time=False): """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) def struct_time_to_jd(st): @@ -106,7 +105,7 @@ def jd_to_struct_time(jd): ) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -142,4 +141,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 9fabdd1..4a12b58 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -17,7 +17,8 @@ # 10-14-1582 never occurred. Python datetime objects will produce incorrect # time deltas if one date is from before 10-15-1582. -def mjd_to_jd(mjd): + +def mjd_to_jd(mjd: float) -> float: """ Convert Modified Julian Day to Julian Day. @@ -30,13 +31,11 @@ def mjd_to_jd(mjd): ------- jd : float Julian Day - - """ return mjd + 2400000.5 -def jd_to_mjd(jd): +def jd_to_mjd(jd: float) -> float: """ Convert Julian Day to Modified Julian Day @@ -54,7 +53,7 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year,month,day): +def date_to_jd(year: int, month: int, day: float) -> float: """ Convert a date to Julian Day. @@ -117,7 +116,7 @@ def date_to_jd(year,month,day): return jd -def jd_to_date(jd): +def jd_to_date(jd: float) -> (int, int, float): """ Convert Julian Day to date. @@ -184,7 +183,10 @@ def jd_to_date(jd): return year, month, day -def hmsm_to_days(hour=0,min=0,sec=0,micro=0): +def hmsm_to_days(hour: int = 0, + min: int = 0, + sec: int = 0, + micro: int = 0) -> float: """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -222,7 +224,7 @@ def hmsm_to_days(hour=0,min=0,sec=0,micro=0): return days / 24. -def days_to_hmsm(days): +def days_to_hmsm(days: float) -> (int, int, int, int): """ Convert fractional days to hours, minutes, seconds, and microseconds. Precision beyond microseconds is rounded to the nearest microsecond. @@ -271,7 +273,7 @@ def days_to_hmsm(days): return int(hour), int(min), int(sec), int(micro) -def datetime_to_jd(date): +def datetime_to_jd(date: dt.datetime) -> float: """ Convert a `datetime.datetime` object to Julian Day. @@ -298,7 +300,7 @@ def datetime_to_jd(date): return date_to_jd(date.year,date.month,days) -def jd_to_datetime(jd): +def jd_to_datetime(jd: float) -> dt.datetime: """ Convert a Julian Day to an `jdutil.datetime` object. @@ -328,7 +330,7 @@ def jd_to_datetime(jd): return datetime(year,month,day,hour,min,sec,micro) -def timedelta_to_days(td): +def timedelta_to_days(td: dt.timedelta) -> float: """ Convert a `datetime.timedelta` object to a total number of days. @@ -372,7 +374,7 @@ class datetime(dt.datetime): datetime.datetime : Parent class. """ - def __add__(self,other): + def __add__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -383,7 +385,7 @@ def __add__(self,other): return jd_to_datetime(combined) - def __radd__(self,other): + def __radd__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -394,7 +396,7 @@ def __radd__(self,other): return jd_to_datetime(combined) - def __sub__(self,other): + def __sub__(self, other): if isinstance(other,dt.timedelta): days = timedelta_to_days(other) @@ -412,7 +414,7 @@ def __sub__(self,other): s += "datetime.timedelta, jdutil.datetime and datetime.datetime" raise TypeError(s) - def __rsub__(self,other): + def __rsub__(self, other): if not isinstance(other, (datetime,dt.datetime)): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 4f68f21..8cb72c4 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -36,6 +36,11 @@ APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') +MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') +MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') + + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. @@ -212,9 +217,9 @@ def text_to_edtf_date(text) -> Optional[str]: date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r'\byear\b.+(in|during)\b', t) - mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) - mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index ea137d2..d18ec76 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -207,8 +207,8 @@ def test_natlang(self): """ for i, o in EXAMPLES: e = text_to_edtf(i) - print("%s => %s" % (i, e)) - self.assertEqual(e, o) + print(f"{i} => {e}") + self.assertEqual(e, o, msg=f"Testing {i}") if __name__ == '__main__': diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index d612c5f..14cb3a4 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -282,14 +282,16 @@ def f(toks): edtfParser = level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") -def parse_edtf(str, parseAll=True, fail_silently=False): +def parse_edtf(inp: str, parse_all: bool = True, fail_silently: bool = False): + if not inp: + raise ParseException("You must supply some input text") + try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) - if p: - return p[0] + p = edtfParser.parseString(inp.strip(), parse_all) except ParseException as e: if fail_silently: return None raise EDTFParseException(e) + + if p: + return p[0] diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index b670296..ae7adb4 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -3,6 +3,7 @@ from time import struct_time from datetime import date, datetime from operator import add, sub +from typing import Optional from dateutil.relativedelta import relativedelta @@ -22,7 +23,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> dict: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -85,11 +86,15 @@ def apply_delta(op, time_struct, delta): class EDTFObject(object): """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. """ parser = None + def __init__(self, *args, **kwargs): + errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{errmsg} is not implemented.") + @classmethod def set_parser(cls, p): cls.parser = p @@ -99,7 +104,7 @@ def set_parser(cls, p): def parse_action(cls, toks): kwargs = toks.asDict() try: - return cls(**kwargs) # replace the token list with the class + return cls(**kwargs) # replace the token list with the class except Exception as e: print("trying to %s.__init__(**%s)" % (cls.__name__, kwargs)) raise e @@ -109,19 +114,12 @@ def parse(cls, s): return cls.parser.parseString(s)[0] def __repr__(self): - return "%s: '%s'" % (type(self).__name__, str(self)) - - def __init__(self, *args, **kwargs): - str = "%s.__init__(*%s, **%s)" % ( - type(self).__name__, - args, kwargs, - ) - raise NotImplementedError("%s is not implemented." % str) + return f"{type(self).__name__}: '{str(self)}'" def __str__(self): raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str): raise NotImplementedError def lower_strict(self): @@ -130,7 +128,7 @@ def lower_strict(self): def upper_strict(self): return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str): """ Subclasses should override this to pad based on how precise they are. """ @@ -216,41 +214,40 @@ def __le__(self, other): # (* ************************** Level 0 *************************** *) class Date(EDTFObject): + def __init__(self, year=None, month=None, day=None, **kwargs): + for param in ('date', 'lower', 'upper'): + if param in kwargs: + self.__init__(**kwargs[param]) + return + + self.year = year # Year is required, but sometimes passed in as a 'date' dict. + self.month = month + self.day = day - def set_year(self, y): + def set_year(self, y: int): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self): + def get_year(self) -> int: return self._year year = property(get_year, set_year) - def set_month(self, m): + def set_month(self, m: Optional[int]): self._month = m - if m == None: + if m is None: self.day = None - def get_month(self): + def get_month(self) -> Optional[int]: return self._month month = property(get_month, set_month) - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ('date', 'lower', 'upper'): - if param in kwargs: - self.__init__(**kwargs[param]) - return - - self.year = year # Year is required, but sometimes passed in as a 'date' dict. - self.month = month - self.day = day - def __str__(self): r = self.year if self.month: - r += "-%s" % self.month + r += f"-{self.month}" if self.day: - r += "-%s" % self.day + r += f"-{self.day}" return r def isoformat(self, default=date.max): @@ -260,14 +257,14 @@ def isoformat(self, default=date.max): int(self.day or default.day), ) - def _precise_year(self, lean): + def _precise_year(self, lean: str): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: return int(re.sub(r'[xu]', r'0', self.year)) else: return int(re.sub(r'[xu]', r'9', self.year)) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.month and self.month != "uu": try: return int(self.month) @@ -276,7 +273,7 @@ def _precise_month(self, lean): else: return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): + def _precise_day(self, lean: str): if not self.day or self.day == 'uu': if lean == EARLIEST: return 1 @@ -343,7 +340,7 @@ def __init__(self, lower, upper): self.upper = upper def __str__(self): - return "%s/%s" % (self.lower, self.upper) + return f"{self.lower}/{self.upper}" def _strict_date(self, lean): if lean == EARLIEST: @@ -416,8 +413,8 @@ def __str__(self): def _strict_date(self, lean): if self.date == "open": return dt_to_struct_time(date.today()) - if self.date =="unknown": - return None # depends on the other date + if self.date == "unknown": + return None # depends on the other date return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -454,12 +451,12 @@ def __init__(self, year): self.year = year def __str__(self): - return "y%s" % self.year + return f"y{self.year}" def _precise_year(self): return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str): py = self._precise_year() if lean == EARLIEST: return struct_time( @@ -478,30 +475,26 @@ def __init__(self, year, season, **kwargs): self.day = None def __str__(self): - return "%s-%s" % (self.year, self.season) + return f"{self.year}-{self.season}" def _precise_month(self, lean): rng = appsettings.SEASON_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) class PartialUncertainOrApproximate(Date): - - def set_year(self, y): # Year can be None. - self._year = y - year = property(Date.get_year, set_year) - def __init__( self, year=None, month=None, day=None, - year_ua=False, month_ua = False, day_ua = False, - year_month_ua = False, month_day_ua = False, - ssn=None, season_ua=False, all_ua=False + year_ua: Optional[UA] = None, month_ua: Optional[UA] = None, + day_ua: Optional[UA] = None, year_month_ua: Optional[UA] = None, + month_day_ua: Optional[UA] = None, ssn=None, + season_ua: Optional[UA] = None, all_ua: Optional[UA] = None ): self.year = year self.month = month @@ -520,56 +513,60 @@ def __init__( self.all_ua = all_ua def __str__(self): - if self.season_ua: - return "%s%s" % (self.season, self.season_ua) + return f"{self.season}{self.season_ua}" if self.year_ua: - y = "%s%s" % (self.year, self.year_ua) + y = f"{self.year}{self.year_ua}" else: y = str(self.year) if self.month_ua: - m = "(%s)%s" % (self.month, self.month_ua) + m = f"({self.month}){self.month_ua}" else: m = str(self.month) if self.day: if self.day_ua: - d = "(%s)%s" % (self.day, self.day_ua) + d = f"({self.day}){self.day_ua}" else: d = str(self.day) else: d = None if self.year_month_ua: # year/month approximate. No brackets needed. - ym = "%s-%s%s" % (y, m, self.year_month_ua) + ym = f"{y}-{m}{self.year_month_ua}" if d: - result = "%s-%s" % (ym, d) + result = f"{ym}-{d}" else: result = ym + elif self.month_day_ua: - if self.year_ua: # we don't need the brackets round month and day - result = "%s-%s-%s%s" % (y, m, d, self.month_day_ua) + if self.year_ua: # we don't need the brackets round month and day + result = f"{y}-{m}-{d}{self.month_day_ua}" else: - result = "%s-(%s-%s)%s" % (y, m, d, self.month_day_ua) + result = f"{y}-({m}-{d}){self.month_day_ua}" else: if d: - result = "%s-%s-%s" % (y, m, d) + result = f"{y}-{m}-{d}" else: - result = "%s-%s" % (y, m) + result = f"{y}-{m}" if self.all_ua: - result = "(%s)%s" % (result, self.all_ua) + result = f"({result}){self.all_ua}" return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. + self._year = y + year = property(Date.get_year, set_year) + + def _precise_year(self, lean: str): if self.season: return self.season._precise_year(lean) return super(PartialUncertainOrApproximate, self)._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.season: return self.season._precise_month(lean) return super(PartialUncertainOrApproximate, self)._precise_month(lean) @@ -638,7 +635,7 @@ def __init__(self, lower=None, upper=None): self.upper = upper def __str__(self): - return "%s..%s" % (self.lower or '', self.upper or '') + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Consecutives): @@ -650,41 +647,40 @@ class LaterConsecutives(Consecutives): class OneOfASet(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "[%s]" % (", ".join([str(o) for o in self.objects])) + return f"[{', '.join([str(o) for o in self.objects])}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + + return min([x._strict_date(lean) for x in self.objects]) class MultipleDates(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "{%s}" % (", ".join([str(o) for o in self.objects])) + return f"{{{', '.join([str(o) for o in self.objects])}}}" def _strict_date(self, lean): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + return min([x._strict_date(lean) for x in self.objects]) class MaskedPrecision(Date): @@ -695,12 +691,13 @@ class Level2Interval(Level1Interval): def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. if isinstance(lower, (tuple, list)) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower + if isinstance(lower, (tuple, list)) and len(upper) == 1: self.upper = upper[0] else: @@ -718,7 +715,7 @@ def _precise_year(self): def get_year(self): if self.precision: - return '%se%sp%s' % (self.base, self.exponent, self.precision) + return f'{self.base}e{self.exponent}p{self.precision}' else: - return '%se%s' % (self.base, self.exponent) + return f'{self.base}e{self.exponent}' year = property(get_year) diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index f9dde42..77c2ad3 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -3,10 +3,11 @@ from datetime import date from time import struct_time +from pyparsing import ParseException + from edtf.parser.grammar import parse_edtf as parse from edtf.parser.parser_classes import EDTFObject, TIME_EMPTY_TIME, \ TIME_EMPTY_EXTRAS -from edtf.parser.edtf_exceptions import EDTFParseException # Example object types and attributes. # the first item in each tuple is the input EDTF string, and expected parse result. @@ -192,17 +193,30 @@ None, '', 'not a edtf string', - 'y17e7-12-26', # not implemented - '2016-13-08', # wrong day order - '2016-02-39', # out of range + 'y17e7-12-26', # not implemented + '2016-13-08', # wrong day order + '2016-02-39', # out of range '-0000-01-01', # negative zero year ) class TestParsing(unittest.TestCase): + def iso_to_struct_time(self, iso_date): + """ Convert YYYY-mm-dd date strings to time structs """ + if iso_date[0] == '-': + is_negative = True + iso_date = iso_date[1:] + else: + is_negative = False + y, mo, d = [int(i) for i in iso_date.split('-')] + if is_negative: + y *= -1 + return struct_time( + [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + def test_non_parsing(self): for i in BAD_EXAMPLES: - self.assertRaises(EDTFParseException, parse, i) + self.assertRaises(ParseException, parse, i) def test_date_values(self): """ @@ -217,13 +231,15 @@ def test_date_values(self): else: o = i - sys.stdout.write("parsing '%s'" % i) + sys.stdout.write(f"parsing '{i}'") f = parse(i) - sys.stdout.write(" => %s()\n" % type(f).__name__) + sys.stdout.write(f" => {type(f).__name__}()\n") self.assertIsInstance(f, EDTFObject) - self.assertEqual(str(f), o) + self.assertEqual(str(f), o, msg=f"Testing {i}") - if len(e) == 5: + if len(e) == 1: + continue + elif len(e) == 5: expected_lower_strict = e[1] expected_upper_strict = e[2] expected_lower_fuzzy = e[3] @@ -243,33 +259,21 @@ def test_date_values(self): expected_upper_strict = e[1] expected_lower_fuzzy = e[1] expected_upper_fuzzy = e[1] - if len(e) == 1: + else: + print(f"Unexpected value {e}; skipping.") continue - def iso_to_struct_time(iso_date): - """ Convert YYYY-mm-dd date strings to time structs """ - if iso_date[0] == '-': - is_negative = True - iso_date = iso_date[1:] - else: - is_negative = False - y, mo, d = [int(i) for i in iso_date.split('-')] - if is_negative: - y *= -1 - return struct_time( - [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - # Convert string date representations into `struct_time`s - expected_lower_strict = iso_to_struct_time(expected_lower_strict) - expected_upper_strict = iso_to_struct_time(expected_upper_strict) - expected_lower_fuzzy = iso_to_struct_time(expected_lower_fuzzy) - expected_upper_fuzzy = iso_to_struct_time(expected_upper_fuzzy) + exp_lower_str = self.iso_to_struct_time(expected_lower_strict) + exp_upper_str = self.iso_to_struct_time(expected_upper_strict) + exp_lower_fuzz = self.iso_to_struct_time(expected_lower_fuzzy) + exp_upper_fuzz = self.iso_to_struct_time(expected_upper_fuzzy) try: - self.assertEqual(f.lower_strict(), expected_lower_strict) - self.assertEqual(f.upper_strict(), expected_upper_strict) - self.assertEqual(f.lower_fuzzy(), expected_lower_fuzzy) - self.assertEqual(f.upper_fuzzy(), expected_upper_fuzzy) + self.assertEqual(f.lower_strict(), exp_lower_str) + self.assertEqual(f.upper_strict(), exp_upper_str) + self.assertEqual(f.lower_fuzzy(), exp_lower_fuzz) + self.assertEqual(f.upper_fuzzy(), exp_upper_fuzz) except Exception as x: # Write to stdout for manual debugging, I guess sys.stdout.write(str(x)) From 06ab934befb7a665301587134794ddbc50b60964 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Wed, 24 Jul 2024 11:18:51 +0200 Subject: [PATCH 08/21] Further optimizations --- edtf/natlang/en.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 8cb72c4..d7d7b8d 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools from datetime import datetime from typing import Optional @@ -40,15 +41,12 @@ MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') - - # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +@functools.lru_cache() def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -123,7 +121,8 @@ def text_to_edtf(text: str) -> Optional[str]: return result -def text_to_edtf_date(text) -> Optional[str]: +@functools.lru_cache() +def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -137,9 +136,8 @@ def text_to_edtf_date(text) -> Optional[str]: t = text.lower() result = '' - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return None + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) @@ -185,7 +183,6 @@ def text_to_edtf_date(text) -> Optional[str]: else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. From c9cb56fe7dfcfe3f55ee981106bce7e73e7b7554 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 12 Aug 2024 14:27:41 +0200 Subject: [PATCH 09/21] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ba74660..4d58675 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store From 9e51373eea989f4ea306408138b31ce53bdef1ab Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:01:47 +0200 Subject: [PATCH 10/21] Black formatting, updates --- edtf/natlang/en.py | 101 +++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index d7d7b8d..191199e 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -14,36 +14,42 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') -LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') -CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') -CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') -ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') -TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') -PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") -APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" +) UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") -UNCERTAIN_REPL = re.compile(r'(\d{4})\?') -MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') -MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") -APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) -APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') -UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") -MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') -MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') -MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' @functools.lru_cache() @@ -57,16 +63,16 @@ def text_to_edtf(text: str) -> Optional[str]: t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: Optional[str] = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. # TODO: assemble multiple dates into a {} or [] structure. for split in [",", ";", "or"]: for list_item in t.split(split): - # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -74,10 +80,16 @@ def text_to_edtf(text: str) -> Optional[str]: # match looks from the beginning of the string, search # looks anywhere. - if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 - if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 + if re.search( + PARTIAL_CHECK, d1 + ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: @@ -134,7 +146,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: return None t = text.lower() - result = '' + result: str = "" if re.match(REJECT_RULES, t): return None @@ -143,7 +155,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year @@ -151,7 +163,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # the word 'circa' anywhere # detect uncertainty signifiers - t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms @@ -191,7 +203,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_1 + default=DEFAULT_DATE_1, ) dt2 = parse( @@ -199,14 +211,13 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_2 + default=DEFAULT_DATE_2, ) except ValueError: return None - if dt1.date() == DEFAULT_DATE_1.date() and \ - dt2.date() == DEFAULT_DATE_2.date(): + if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. return None @@ -223,12 +234,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += 'x' + result += "x" elif i == 3 and is_decade: if mentions_year: - result += 'u' # year precision + result += "X" # year precision else: - result += 'x' # decade precision + result += "x" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default @@ -236,12 +247,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: else: # different values were produced, meaning that it's likely # a default. Use 'unspecified' - result += "u" + result += "X" # strip off unknown chars from end of string - except the first 4 for i in reversed(range(len(result))): - if result[i] not in ('u', 'x', '-'): + if result[i] not in ("X", "-"): smallest_length = 4 if mentions_month: @@ -265,14 +276,16 @@ def text_to_edtf_date(text: str) -> Optional[str]: # end dateutil post-parsing - if is_uncertain: - result += "?" - - if is_approximate: - result += "~" + if is_uncertain and is_approximate: + result += "%" + else: + if is_uncertain: + result += "?" + if is_approximate: + result += "~" # weed out bad parses - if result.startswith("uu-uu"): + if result.startswith("XX-XX"): return None return result From 1aa53cfb2d4e0a2a3c284ec20db60f841b88a7f9 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:03:16 +0200 Subject: [PATCH 11/21] Update imports --- edtf/natlang/en.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 191199e..ba192e8 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,12 +1,12 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" import functools +import re from datetime import datetime from typing import Optional from dateutil.parser import parse -import re -from edtf import appsettings +from edtf import appsettings # two dates where every digit of an ISO date representation is different, # and one is in the past and one is in the future. From 8c4f9685bc31224bcd0efcf811485f2e3f34e292 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:48:01 +0200 Subject: [PATCH 12/21] Merge fixes --- edtf/natlang/en.py | 18 ++++++++++-------- edtf/parser/parser_classes.py | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ba192e8..49b04f3 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Optional -from dateutil.parser import parse +from dateutil.parser import ParserError, parse from edtf import appsettings @@ -126,9 +126,9 @@ def text_to_edtf(text: str) -> Optional[str]: is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"unknown/{result}" + result = f"/{result}" elif is_after: - result = f"{result}/unknown" + result = f"{result}/" return result @@ -172,7 +172,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dxx" % (int(is_century[0][0]) - 1,) + result = "%02dXX" % (int(is_century[0][0]) - 1,) is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) @@ -214,8 +214,10 @@ def text_to_edtf_date(text: str) -> Optional[str]: default=DEFAULT_DATE_2, ) - except ValueError: - return None + except ParserError: + return + except Exception: + return if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. @@ -234,12 +236,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += "x" + result += "X" elif i == 3 and is_decade: if mentions_year: result += "X" # year precision else: - result += "x" # decade precision + result += "X" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index d103660..eada1f9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -4,6 +4,7 @@ from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta From 6f08bce95cb583f2825353cbe8ae6a1de1c47df7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:55:59 +0200 Subject: [PATCH 13/21] ruff formatting --- edtf/natlang/en.py | 5 +++-- edtf/parser/parser_classes.py | 9 ++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 49b04f3..97230db 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" + import functools import re from datetime import datetime @@ -52,7 +53,7 @@ REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -@functools.lru_cache() +@functools.lru_cache def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -133,7 +134,7 @@ def text_to_edtf(text: str) -> Optional[str]: return result -@functools.lru_cache() +@functools.lru_cache def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index eada1f9..ad690fb 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -98,10 +98,6 @@ class EDTFObject: parser = None - def __init__(self, *args, **kwargs): - errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{errmsg} is not implemented.") - @classmethod def set_parser(cls, p): cls.parser = p @@ -288,6 +284,7 @@ def set_year(self, y: int): def get_year(self) -> int: return self._year + year = property(get_year, set_year) def set_month(self, m: Optional[int]): @@ -297,6 +294,7 @@ def set_month(self, m: Optional[int]): def get_month(self) -> Optional[int]: return self._month + month = property(get_month, set_month) def __str__(self): @@ -932,8 +930,9 @@ def __str__(self): return result - def set_year(self, y): # Year can be None. + def set_year(self, y): # Year can be None. self._year = y + year = property(Date.get_year, set_year) def _precise_year(self, lean: str): From 973ccf4cabcd21cc0d7af5e2d1c8bb86992c65e3 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 17:27:15 +0200 Subject: [PATCH 14/21] Remove accidentally committed poetry file --- poetry.lock | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 poetry.lock diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index c4b40b6..0000000 --- a/poetry.lock +++ /dev/null @@ -1,45 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. - -[[package]] -name = "pyparsing" -version = "3.1.2" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false -python-versions = ">=3.6.8" -files = [ - {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, - {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, -] - -[package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -description = "Extensions to the standard Python datetime module" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, - {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[metadata] -lock-version = "2.0" -python-versions = "^3.9" -content-hash = "e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200" From ee450a55a74069daf44da6c476a823dc879f6e78 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Wed, 14 Aug 2024 13:14:07 +0200 Subject: [PATCH 15/21] Fixed: f-string formatting Also added Andrew Hankinson to the authors list in pyproject.toml --- edtf/natlang/en.py | 2 +- pyproject.toml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 97230db..d57bb82 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -104,7 +104,7 @@ def text_to_edtf(text: str) -> Optional[str]: r2 = text_to_edtf_date(d2) if r1 and r2: - result = r1 + "/" + r2 + result = f"{r1}/{r2}" return result # is it an either/or year "1838/1862" - that has a different diff --git a/pyproject.toml b/pyproject.toml index b48c3f7..2d050c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,8 @@ authors = [ { name = "Mark Finger" }, { name = "Sabine Müller" }, { name = "Cole Crawford" }, - { name = "Klaus Rettinghaus" } + { name = "Klaus Rettinghaus" }, + { name = "Andrew Hankinson", email = "andrew.hankinson@rism.digital" }, ] maintainers = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au" } From 46bdce6bd97956088e932ba1ca359bac71ca3f06 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:40:50 +0200 Subject: [PATCH 16/21] Fixed: return type of statement --- edtf/parser/parser_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index ad690fb..c334ee9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -28,7 +28,7 @@ PRECISION_DAY = "day" -def days_in_month(year: int, month: int) -> dict: +def days_in_month(year: int, month: int) -> int: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by From 656f8ad900ddd3d02ead2fce2eb9575c7d049025 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:44:38 +0200 Subject: [PATCH 17/21] Updated parser classes I've had a pass at the Parser Classes file, but there are a lot of problems still to be sorted out. I've added return types and argument types whereever it makes sense. The "UncertainOrApproximate" class is a hot mess. There are boolean values with property and method calls associated with them, and I would be surprised if it actually works. However, it doesn't seem to be tested or implemented, so I can't figure out where to go from here. --- edtf/parser/parser_classes.py | 192 +++++++++++++++++----------------- 1 file changed, 94 insertions(+), 98 deletions(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index c334ee9..eb9fac5 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -126,7 +126,7 @@ def __init__(self, *args, **kwargs): def __str__(self): raise NotImplementedError - def _strict_date(self, lean: str): + def _strict_date(self, lean: str = EARLIEST): raise NotImplementedError def lower_strict(self): @@ -141,30 +141,31 @@ def _get_fuzzy_padding(self, lean: str): """ return relativedelta(0) - def get_is_approximate(self): + def get_is_approximate(self) -> bool: return getattr(self, "_is_approximate", False) - def set_is_approximate(self, val): + def set_is_approximate(self, val: bool) -> None: self._is_approximate = val - is_approximate = property(get_is_approximate, set_is_approximate) + is_approximate = property(get_is_approximate, set_is_approximate) # noqa - def get_is_uncertain(self): + def get_is_uncertain(self) -> bool: return getattr(self, "_is_uncertain", False) - def set_is_uncertain(self, val): + def set_is_uncertain(self, val: bool) -> None: self._is_uncertain = val - is_uncertain = property(get_is_uncertain, set_is_uncertain) + is_uncertain = property(get_is_uncertain, set_is_uncertain) # noqa - def get_is_uncertain_and_approximate(self): + def get_is_uncertain_and_approximate(self) -> bool: return getattr(self, "_uncertain_and_approximate", False) - def set_is_uncertain_and_approximate(self, val): + def set_is_uncertain_and_approximate(self, val: bool) -> None: self._uncertain_and_approximate = val is_uncertain_and_approximate = property( - get_is_uncertain_and_approximate, set_is_uncertain_and_approximate + get_is_uncertain_and_approximate, # noqa + set_is_uncertain_and_approximate, # noqa ) def lower_fuzzy(self): @@ -242,76 +243,71 @@ def __le__(self, other): class Date(EDTFObject): - def set_year(self, y): - if y is None: - raise AttributeError("Year must not be None") - self._year = y - - def get_year(self): - return self._year - - year = property(get_year, set_year) - - def set_month(self, m): - self._month = m - if m is None: - self.day = None - - def get_month(self): - return self._month - - month = property(get_month, set_month) - - def __init__( - self, year=None, month=None, day=None, significant_digits=None, **kwargs + def __init__( # noqa + self, + year: Optional[str] = None, + month: Optional[str] = None, + day: Optional[str] = None, + significant_digits=None, + **kwargs, ): for param in ("date", "lower", "upper"): if param in kwargs: self.__init__(**kwargs[param]) return - self.year = year # Year is required, but sometimes passed in as a 'date' dict. - self.month = month - self.day = day + self._year = year # Year is required, but sometimes passed in as a 'date' dict. + self._month = month + self._day = day self.significant_digits = ( int(significant_digits) if significant_digits else None ) - def set_year(self, y: int): + def set_year(self, y: str): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self) -> int: + def get_year(self) -> str: return self._year - year = property(get_year, set_year) + year = property(get_year, set_year) # noqa - def set_month(self, m: Optional[int]): + def set_month(self, m: Optional[str]): self._month = m if m is None: - self.day = None + self._day = None - def get_month(self) -> Optional[int]: + def get_month(self) -> Optional[str]: return self._month - month = property(get_month, set_month) + month = property(get_month, set_month) # noqa + + def set_day(self, d: Optional[str]): + self._day = d + if d is None: + self._day = None + + def get_day(self) -> Optional[str]: + return self._day + + day = property(get_day, set_day) # noqa def __str__(self): - r = self.year - if self.month: - r += f"-{self.month}" - if self.day: - r += f"-{self.day}" + r = self._year + if self._month: + r += f"-{self._month}" + if self._day: + r += f"-{self._day}" if self.significant_digits: r += f"S{self.significant_digits}" return r def isoformat(self, default=date.max): return "%s-%02d-%02d" % ( - self.year, - int(self.month or default.month), - int(self.day or default.day), + self._year, + int(self._month or default.month), + int(self._day or default.day), ) def lower_fuzzy(self): @@ -320,10 +316,10 @@ def lower_fuzzy(self): sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) ) else: - total_digits = len(self.year) + total_digits = len(self._year) insignificant_digits = total_digits - self.significant_digits lower_year = ( - int(self.year) + int(self._year) // (10**insignificant_digits) * (10**insignificant_digits) ) @@ -335,9 +331,9 @@ def upper_fuzzy(self): add, self.upper_strict(), self._get_fuzzy_padding(LATEST) ) else: - total_digits = len(self.year) + total_digits = len(self._year) insignificant_digits = total_digits - self.significant_digits - upper_year = (int(self.year) // (10**insignificant_digits) + 1) * ( + upper_year = (int(self._year) // (10**insignificant_digits) + 1) * ( 10**insignificant_digits ) - 1 return struct_time( @@ -347,23 +343,23 @@ def upper_fuzzy(self): def _precise_year(self, lean): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: - return int(re.sub(r"X", r"0", self.year)) + return int(re.sub(r"X", r"0", self._year)) else: - return int(re.sub(r"X", r"9", self.year)) + return int(re.sub(r"X", r"9", self._year)) def _precise_month(self, lean): - if self.month and self.month != "XX": + if self._month and self._month != "XX": try: - return int(self.month) + return int(self._month) except ValueError as err: raise ValueError( - f"Couldn't convert {self.month} to int (in {self})" + f"Couldn't convert {self._month} to int (in {self})" ) from err else: return 1 if lean == EARLIEST else 12 def _precise_day(self, lean): - if not self.day or self.day == "XX": + if not self._day or self._day == "XX": if lean == EARLIEST: return 1 else: @@ -371,9 +367,9 @@ def _precise_day(self, lean): self._precise_year(LATEST), self._precise_month(LATEST) ) else: - return int(self.day) + return int(self._day) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): """ Return a `time.struct_time` representation of the date. """ @@ -389,9 +385,9 @@ def _strict_date(self, lean): @property def precision(self): - if self.day: + if self._day: return PRECISION_DAY - if self.month: + if self._month: return PRECISION_MONTH return PRECISION_YEAR @@ -400,7 +396,7 @@ def estimated(self): class DateAndTime(EDTFObject): - def __init__(self, date, time): + def __init__(self, date, time): # noqa: super raises not implemented self.date = date self.time = time @@ -410,7 +406,7 @@ def __str__(self): def isoformat(self): return self.date.isoformat() + "T" + self.time - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): return self.date._strict_date(lean) def __eq__(self, other): @@ -429,14 +425,14 @@ def __ne__(self, other): class Interval(EDTFObject): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa: super() raises not implemented self.lower = lower self.upper = upper def __str__(self): return f"{self.lower}/{self.upper}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): if lean == EARLIEST: r = self.lower._strict_date(lean) else: @@ -459,7 +455,7 @@ def parse_action(cls, toks): args = toks.asList() return cls(*args) - def __init__(self, *args): + def __init__(self, *args): # noqa: super() raises not implemented if len(args) != 1: raise AssertionError("UA must have exactly one argument") ua = args[0] @@ -488,7 +484,7 @@ def _get_multiplier(self): class UncertainOrApproximate(EDTFObject): - def __init__(self, date, ua): + def __init__(self, date, ua): # noqa: super() raises not implemented self.date = date self.ua = ua self.is_uncertain = ua.is_uncertain if ua else False @@ -503,7 +499,7 @@ def __str__(self): else: return str(self.date) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -532,7 +528,7 @@ def _get_fuzzy_padding(self, lean): class UnspecifiedIntervalSection(EDTFObject): - def __init__(self, sectionOpen=False, other_section_element=None): + def __init__(self, sectionOpen=False, other_section_element=None): # noqa: super() raises not implemented if sectionOpen: self.is_open = True self.is_unknown = False @@ -547,14 +543,17 @@ def __str__(self): else: return ".." - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): + if lean not in (EARLIEST, LATEST): + raise ValueError("lean must be one of EARLIEST or LATEST") + if lean == EARLIEST: if self.is_unknown: upper = self.other._strict_date(LATEST) return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) else: return -math.inf - else: + elif lean == LATEST: if self.is_unknown: lower = self.other._strict_date(EARLIEST) return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) @@ -717,7 +716,7 @@ def precision(self): class Level1Interval(Interval): - def __init__(self, lower=None, upper=None): + def __init__(self, lower: Optional[dict] = None, upper: Optional[dict] = None): # noqa if lower: if lower["date"] == "..": self.lower = UnspecifiedIntervalSection( @@ -740,8 +739,10 @@ def __init__(self, lower=None, upper=None): self.upper = UnspecifiedIntervalSection( False, UncertainOrApproximate(**lower) ) - self.is_approximate = self.lower.is_approximate or self.upper.is_approximate - self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_approximate: bool = ( + self.lower.is_approximate or self.upper.is_approximate + ) + self.is_uncertain: bool = self.lower.is_uncertain or self.upper.is_uncertain self.is_uncertain_and_approximate = ( self.lower.is_uncertain_and_approximate or self.upper.is_uncertain_and_approximate @@ -755,7 +756,7 @@ def _get_fuzzy_padding(self, lean): class LongYear(EDTFObject): - def __init__(self, year, significant_digits=None): + def __init__(self, year: str, significant_digits: Optional[str] = None): # noqa self.year = year self.significant_digits = ( int(significant_digits) if significant_digits else None @@ -770,7 +771,7 @@ def __str__(self): def _precise_year(self): return int(self.year) - def _strict_date(self, lean: str): + def _strict_date(self, lean: str = EARLIEST): py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) @@ -818,7 +819,7 @@ def upper_fuzzy(self): class Season(Date): - def __init__(self, year, season, **kwargs): + def __init__(self, year, season, **kwargs): # noqa self.year = year self.season = season # use season to look up month # day isn't part of the 'season' spec, but it helps the inherited @@ -840,12 +841,7 @@ def _precise_month(self, lean): class PartialUncertainOrApproximate(Date): - def set_year(self, y): # Year can be None. - self._year = y - - year = property(Date.get_year, set_year) - - def __init__( + def __init__( # noqa self, year=None, month=None, @@ -933,7 +929,7 @@ def __str__(self): def set_year(self, y): # Year can be None. self._year = y - year = property(Date.get_year, set_year) + year = property(Date.get_year, set_year) # noqa def _precise_year(self, lean: str): if self.season: @@ -1018,7 +1014,7 @@ class PartialUnspecified(Unspecified): class Consecutives(Interval): # Treating Consecutive ranges as intervals where one bound is optional - def __init__(self, lower=None, upper=None): + def __init__(self, lower=None, upper=None): # noqa if lower and not isinstance(lower, EDTFObject): self.lower = Date.parse(lower) else: @@ -1044,7 +1040,7 @@ def __str__(self): class OneOfASet(EDTFObject): - def __init__(self, *args): + def __init__(self, *args): # noqa self.objects = args @classmethod @@ -1053,9 +1049,9 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return "[{}]".format(", ".join([str(o) for o in self.objects])) + return f"[{", ".join([str(o) for o in self.objects])}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): strict_dates = [x._strict_date(lean) for x in self.objects] # Accounting for possible 'inf' and '-inf' values if lean == LATEST: @@ -1077,7 +1073,7 @@ def _strict_date(self, lean): class MultipleDates(EDTFObject): - def __init__(self, *args): + def __init__(self, *args): # noqa self.objects = args @classmethod @@ -1086,16 +1082,16 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return "{{{}}}".format(", ".join([str(o) for o in self.objects])) + return f"{{{", ".join([str(o) for o in self.objects])}}}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) return min([x._strict_date(lean) for x in self.objects]) class Level2Interval(Level1Interval): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a # bug in the grammar that provides us with single-item lists of @@ -1122,7 +1118,7 @@ class Level2Season(Season): class ExponentialYear(LongYear): - def __init__(self, base, exponent, significant_digits=None): + def __init__(self, base, exponent, significant_digits=None): # noqa self.base = base self.exponent = exponent self.significant_digits = ( @@ -1132,13 +1128,13 @@ def __init__(self, base, exponent, significant_digits=None): def _precise_year(self): return int(self.base) * 10 ** int(self.exponent) - def get_year(self): + def get_year(self) -> str: if self.significant_digits: return f"{self.base}E{self.exponent}S{self.significant_digits}" else: return f"{self.base}E{self.exponent}" - year = property(get_year) + year = property(get_year) # noqa def estimated(self): return self._precise_year() From add79bd311c2af7698043deff7f992535cb22aed Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:45:39 +0200 Subject: [PATCH 18/21] Fixed: Remove SHORT_YEAR_RE This wasn't actually used anywhere! Also removed a redundant regex group --- edtf/natlang/en.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index d57bb82..9cee578 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -15,7 +15,6 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") @@ -27,7 +26,7 @@ BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") APPROX_CHECK = re.compile( - r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" + r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|^~" ) UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") UNCERTAIN_REPL = re.compile(r"(\d{4})\?") From fee0b648e2344169aeee2b35068c670afc7325a7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:50:51 +0200 Subject: [PATCH 19/21] Problem with f-string --- edtf/parser/parser_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index eb9fac5..0334738 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1049,7 +1049,8 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return f"[{", ".join([str(o) for o in self.objects])}]" + repr: str = ", ".join([str(o) for o in self.objects]) + return f"[{repr}]" def _strict_date(self, lean: str = EARLIEST): strict_dates = [x._strict_date(lean) for x in self.objects] From 89f36924adf59d271aadc3df6ac3ea1454ccb093 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:59:23 +0200 Subject: [PATCH 20/21] Another f-string fix --- edtf/parser/parser_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 0334738..14728f0 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1083,7 +1083,8 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return f"{{{", ".join([str(o) for o in self.objects])}}}" + repr: str = ", ".join([str(o) for o in self.objects]) + return f"{{{repr}}}" def _strict_date(self, lean: str = EARLIEST): if lean == LATEST: From 9da1d94436e124a337fd81133cee5ac48b85cea5 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 16:29:46 +0200 Subject: [PATCH 21/21] Fixed: pyproject errors --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 2d050c2..8826b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,8 @@ [project] name = "edtf" version = "5.0.0" +license = { file = "LICENSE" } +keywords = ['edtf'] dependencies = [ "python-dateutil", "pyparsing",