Skip to content

Commit

Permalink
Merge pull request #45 from artshumrc/pytest-format-tests
Browse files Browse the repository at this point in the history
Reformat tests to use Pytest style; fix infinite comparison for OneOfASet
  • Loading branch information
aweakley authored May 10, 2024
2 parents 7ba63ad + 46b7b8f commit 98636f7
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 425 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
Expand Down
115 changes: 51 additions & 64 deletions edtf/natlang/tests.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,30 @@
import unittest
import pytest
from edtf.natlang.en import text_to_edtf

# TODO update the tests and code to test and output the new spec

# where examples are tuples, the second item is the normalised output
EXAMPLES = (
('active late 17th-19th centuries', '16xx/18xx'), # ignoring 'late' for now
('active 17-19th Centuries', '16xx/18xx'), # ignoring 'late' for now
@pytest.mark.parametrize("input_text,expected_output", [
# Ignoring 'late' for simplicity in these examples
('active late 17th-19th centuries', '16xx/18xx'),
('active 17-19th Centuries', '16xx/18xx'),
# Unrecognised values
('', None),
('this isn\'t a date', None),
# Explicity rejected values that would otherwise be badly converted
# Explicitly rejected values that would otherwise be badly converted
('23rd Dynasty', None),
('90', '1990'), # implied century
# Implied century and specific years
('90', '1990'), # Implied century
('1860', '1860'),
('the year 1800', '1800'),
('the year 1897', '1897'),
('January 2008', '2008-01'),
('January 12, 1940', '1940-01-12'),
# uncertain/approximate
# Uncertain or approximate dates
('1860?', '1860?'),
('1862 (uncertain)', '1862?'),
('maybe 1862', '1862?'),
Expand All @@ -31,11 +35,11 @@
('~ Feb 1812', '1812-02~'),
('circa Feb 1812', '1812-02~'),
('Feb 1812 approx', '1812-02~'),
('c1860', '1860~'), # different abbreviations
('c.1860', '1860~'), # with or without .
('c1860', '1860~'), # Different abbreviations
('c.1860', '1860~'), # With or without .
('ca1860', '1860~'),
('ca.1860', '1860~'),
('c 1860', '1860~'), # with or without space
('c 1860', '1860~'), # With or without space
('c. 1860', '1860~'),
('ca. 1860', '1860~'),
('approx 1860', '1860~'),
Expand All @@ -44,75 +48,72 @@
('approximately 1860', '1860~'),
('about 1860', '1860~'),
('about Spring 1849', '1849-21~'),
('notcirca 1860', '1860'), # avoid words containing circa
('attica 1802', '1802'),
# avoid false positive circa at the end of preceding word
('attic. 1802', '1802'), # avoid false positive circa
('notcirca 1860', '1860'), # Avoid words containing 'circa'
('attica 1802', '1802'), # Avoid false positive 'circa' at the end of preceding word
('attic. 1802', '1802'), # Avoid false positive 'circa'
# masked precision
('1860s', '186x'), # 186x has decade precision, 186u has year precision.
# Masked precision
('1860s', '186x'), # 186x has decade precision, 186u has year precision.
# masked precision + uncertainty
# Masked precision + uncertainty
('ca. 1860s', '186x~'),
('c. 1860s', '186x~'),
('Circa 1840s', '184x~'),
('circa 1840s', '184x~'),
('ca. 1860s?', '186x?~'),
('uncertain: approx 1862', '1862?~'),
# masked precision with first decade (ambiguous)
('1800s', '18xx'), # without additional uncertainty, use the century
('2000s', '20xx'), # without additional uncertainty, use the century
('c1900s', '190x~'), # if there's additional uncertainty, use the decade
('c1800s?', '180x?~'), # if there's additional uncertainty, use the decade
# Ambiguous masked precision for centuries and decades
('1800s', '18xx'), # Without additional uncertainty, use the century
('2000s', '20xx'), # Without additional uncertainty, use the century
('c1900s', '190x~'), # If there's additional uncertainty, use the decade
('c1800s?', '180x?~'), # If there's additional uncertainty, use the decade
# unspecified
# Unspecified dates
('January 12', 'uuuu-01-12'),
('January', 'uuuu-01'),
('10/7/2008', '2008-10-07'),
('7/2008', '2008-07'),
# seasons
# Seasons mapped to specific codes
('Spring 1872', '1872-21'),
('Summer 1872', '1872-22'),
('Autumn 1872', '1872-23'),
('Fall 1872', '1872-23'),
('Winter 1872', '1872-24'),
# before/after
# Dates relative to known events (before/after)
('earlier than 1928', 'unknown/1928'),
('before 1928', 'unknown/1928'),
('after 1928', '1928/unknown'),
('later than 1928', '1928/unknown'),
('before January 1928', 'unknown/1928-01'),
('before 18 January 1928', 'unknown/1928-01-18'),
# before/after approx
# Approximations combined with before/after
('before approx January 18 1928', 'unknown/1928-01-18~'),
('before approx January 1928', 'unknown/1928-01~'),
('after approx January 1928', '1928-01~/unknown'),
('after approx Summer 1928', '1928-22~/unknown'),
# before/after and uncertain/unspecificed
# Before and after with uncertain / unspecified components
('after about the 1920s', '192x~/unknown'),
('before about the 1900s', 'unknown/190x~'),
('before the 1900s', 'unknown/19xx'),
# unspecified
# Specifying unspecified components within a date
# ('decade in 1800s', '18ux'), #too esoteric
# ('decade somewhere during the 1800s', '18ux'), #lengthier. Keywords are 'in' or 'during'
('year in the 1860s', '186u'),
# 186x has decade precision, 186u has year precision.
('year in the 1800s', '18xu'),
('year in the 1860s', '186u'), # 186u has year precision (186x would have decade precision)
('year in the 1800s', '18xu'), # 18xu: the same idea, for a year somewhere in the century
('year in about the 1800s', '180u~'),
('month in 1872', '1872-uu'),
('day in Spring 1849', '1849-21-uu'),
('day in January 1872', '1872-01-uu'),
('day in 1872', '1872-uu-uu'),
('birthday in 1872', '1872'),
# avoid false positive at end of preceding word
# centuries
# Handling centuries with approximation and uncertainty
('1st century', '00xx'),
('10c', '09xx'),
('19th century', '18xx'),
Expand All @@ -126,7 +127,7 @@
('19c?', '18xx?'),
('c.19c?', '18xx?~'),
# BC/AD
# BC/AD dating
('1 AD', '0001'),
('17 CE', '0017'),
('127 CE', '0127'),
Expand All @@ -136,18 +137,17 @@
('c127 CE', '0127~'),
('c1270 CE', '1270~'),
('c64 BCE', '-0064~'),
('2nd century bc', '-01xx'), # -200 to -101
('2nd century bc', '-01xx'), # -200 to -101
('2nd century bce', '-01xx'),
('2nd century ad', '01xx'),
('2nd century ce', '01xx'),
# c-c-c-combo
# just showing off now...
# Combining uncertainties and approximations in creative ways
('a day in about Spring 1849?', '1849-21-uu?~'),
# simple ranges. Not all of these results are correct EDTF, but
# this is as good as the EDTF implementation and simple natural
# language parser we have.
# Simple date ranges, showcasing both the limitations and capabilities of the parser
# Not all of these results are correct EDTF, but this is as good as the EDTF implementation
# and simple natural language parser we have.
('1851-1852', '1851/1852'),
('1851-1852; printed 1853-1854', '1851/1852'),
('1851-52', '1851/1852'),
Expand All @@ -156,7 +156,6 @@
('1857-mid 1860s', '1857/186x'),
('1858/1860', '[1858, 1860]'),
('1860s-1870s', '186x/187x'),
('1861, printed 1869', '1861'),
('1910-30', '1910/1930'),
('active 1910-30', '1910/1930'),
('1861-67', '1861/1867'),
Expand All @@ -174,16 +173,13 @@
('1900; 1973', '1900'),
('1900; printed 1912', '1900'),
('1915 late - autumn 1916', '1915/1916-23'),

('1915, from Camerawork, October 1916', '1915'), # should be {1915, 1916-10}
('1915, from Camerawork, October 1916', '1915'), # should be {1915, 1916-10}
('1920s -early 1930s', '192x/193x'),
('1930s, printed early 1960s', '193x'), # should be something like {193x, 196x},
# though those forms aren't explicitly supported in the spec.
('1932, printed 1976 by Gunther Sander', '1932'), # should be {1932, 1976}
('1938, printed 1940s-1950s', '1938'), # should be something like {1938, 194x-195x}


('1938, printed 1940s-1950s', '1938') # should be something like {1938, 194x-195x}
# Uncertain and approximate on different parts of the date
# for these to work we need to recast is_uncertain and is_approximate
# such that they work on different parts. Probably worth rolling our own
# dateparser at this point.
Expand All @@ -194,22 +190,13 @@
# ('a day in about Spring in about 1849', '1849~-21~-uu'),
# ('maybe January in some year in about the 1830s', '183u~-01?'),
# ('about July? in about 1849', '1849~-07?~'),
)


class TestLevel0(unittest.TestCase):
    def test_natlang(self):
        """
        Run every (text, expected) pair in EXAMPLES through text_to_edtf.

        Each conversion is printed, then compared for equality against the
        expected EDTF string (or None for inputs that should be rejected).
        """
        for text, expected in EXAMPLES:
            converted = text_to_edtf(text)
            print("%s => %s" % (text, converted))
            self.assertEqual(converted, expected)
])

def test_natlang(input_text, expected_output):
    """
    Check one natural-language example against its expected EDTF output.

    Both arguments are supplied per case by the parametrize decorator above;
    expected_output is None for text that should not be converted.
    """
    failure_message = f"Failed for input: {input_text}"
    converted = text_to_edtf(input_text)
    assert converted == expected_output, failure_message

if __name__ == '__main__':
unittest.main()
12 changes: 10 additions & 2 deletions edtf/parser/parser_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,10 +713,18 @@ def __str__(self):
return "[%s]" % (", ".join([str(o) for o in self.objects]))

def _strict_date(self, lean):
    """Return the strict date bound of this set for the given ``lean``.

    Every member's ``_strict_date(lean)`` is collected; the largest result
    is returned when ``lean == LATEST`` and the smallest otherwise.

    Members may report an open bound as ``float('inf')`` / ``float('-inf')``
    rather than a date value. Float and non-float results are never passed
    to ``max``/``min`` together (presumably the non-float results are
    ``struct_time`` values, which do not order against floats -- TODO
    confirm); the floats are handled explicitly instead.

    :param lean: ``LATEST`` to get the upper bound, anything else for the
        lower bound.
    :return: a member's strict date, or an infinity when the bound is open.
    """
    leaning_late = lean == LATEST
    choose = max if leaning_late else min
    open_bound = float("inf") if leaning_late else float("-inf")

    strict_dates = [x._strict_date(lean) for x in self.objects]
    floats = [d for d in strict_dates if isinstance(d, float)]
    concrete = [d for d in strict_dates if not isinstance(d, float)]

    # An open bound in the direction we are leaning dominates everything.
    if open_bound in floats:
        return open_bound

    # Otherwise any real date beats an infinity pointing the other way.
    if concrete:
        return choose(concrete)

    # Only opposite-direction infinities are left (return that infinity, not
    # the leaning one), or the set is empty (fall back to the open bound).
    return choose(floats, default=open_bound)


class MultipleDates(EDTFObject):
Expand Down
Loading

0 comments on commit 98636f7

Please sign in to comment.