Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance and move ISO-8601 parser to coding.times #9899

Merged
merged 21 commits into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ Internal Changes
within ``as_compatible_data``. This is consistent with how lists of these objects
will be converted (:pull:`9900`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


.. _whats-new.2024.11.0:

Expand Down
17 changes: 16 additions & 1 deletion properties/test_encode_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@

"""

import warnings

import pytest

pytest.importorskip("hypothesis")
# isort: split

import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import given

import xarray as xr
from xarray.testing.strategies import variables
from xarray.coding.times import _parse_iso8601
from xarray.testing.strategies import CFTimeStrategyISO8601, variables
from xarray.tests import requires_cftime


@pytest.mark.slow
Expand Down Expand Up @@ -43,3 +48,13 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
coder = xr.coding.variables.CFScaleOffsetCoder()
roundtripped = coder.decode(coder.encode(original))
xr.testing.assert_identical(original, roundtripped)


@requires_cftime
@given(dt=st.datetimes() | CFTimeStrategyISO8601())
def test_iso8601_decode(dt):
    """Property test: parsing ``dt.isoformat()`` must give back ``dt``.

    ``dt`` is drawn either from ``st.datetimes()`` or from
    ``CFTimeStrategyISO8601``; the parser is handed ``type(dt)`` so the
    result is built with the same date type as the input.
    """
    date_type = type(dt)
    text = dt.isoformat()
    with warnings.catch_warnings():
        # Silence warnings whose message mentions "date/calendar/year zero".
        warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
        roundtripped, _resolution = _parse_iso8601(date_type, text)
        assert dt == roundtripped
5 changes: 3 additions & 2 deletions xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@
import pandas as pd
from packaging.version import Version

from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.coding.times import (
_is_standard_calendar,
_parse_iso8601,
_should_cftime_be_used,
convert_time_or_go_back,
format_cftime_datetime,
Expand Down Expand Up @@ -843,7 +844,7 @@ def to_cftime_datetime(date_str_or_date, calendar=None):
"If converting a string to a cftime.datetime object, "
"a calendar type must be provided"
)
date, _ = _parse_iso8601_with_reso(get_date_type(calendar), date_str_or_date)
date, _ = _parse_iso8601(get_date_type(calendar), date_str_or_date)
return date
elif isinstance(date_str_or_date, cftime.datetime):
return date_str_or_date
Expand Down
82 changes: 6 additions & 76 deletions xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from __future__ import annotations

import math
import re
import warnings
from datetime import timedelta
from typing import TYPE_CHECKING, Any
Expand All @@ -53,6 +52,7 @@

from xarray.coding.times import (
_STANDARD_CALENDARS,
_parse_iso8601,
cftime_to_nptime,
infer_calendar_name,
)
Expand All @@ -78,71 +78,6 @@
OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)


def named(name, pattern):
    """Return ``pattern`` wrapped in a regex named group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x):
    """Return ``x`` wrapped in an optional, non-capturing regex group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs):
    """Chain the fragments in ``xs`` so every fragment after the first is optional.

    The result nests the optional groups: a later fragment can only match
    when all fragments before it matched. An empty ``xs`` gives ``""``.
    """
    if not xs:
        return ""
    nested = ""
    # Build from the innermost (last) fragment outwards.
    for fragment in reversed(xs):
        nested = fragment + optional(nested)
    return nested


def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The year (four digits) is mandatory. Month, day, hour, minute, second
    and microsecond follow in that order; each is optional but can only
    appear when every preceding field is present.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second
        and microsecond fields respectively. An empty string means the
        field follows without a separator.

    Returns
    -------
    str
        Pattern anchored with ``^`` and ``$`` containing one named group
        per field.
    """
    # NOTE: the microsecond separator default is an escaped, literal dot.
    # An unescaped "." would let any character precede the fraction.
    pieces = [
        (None, "year", r"\d{4}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep or "") + named(name, sub_pattern) for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Date and time fields written without separators between them.
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# build_pattern's default separators.
_EXTENDED_PATTERN = build_pattern()
# Default separators, except a space between the date and the time part.
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
# parse_iso8601_like tries these in list order and uses the first match.
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Match ``datetime_string`` against the known patterns.

    Returns the named groups of the first pattern in ``_PATTERNS`` that
    matches; groups for fields absent from the string are ``None``.

    Raises
    ------
    ValueError
        If none of the patterns matches.
    """
    attempts = (re.match(pattern, datetime_string) for pattern in _PATTERNS)
    match = next((m for m in attempts if m), None)
    if match is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return match.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)`` where ``date`` is
    ``date_type(1, 1, 1)`` with every field found in ``timestr`` replaced,
    and ``resolution`` is the name of the last field that was present.
    """
    # Called only for its effect; the returned module is not used here.
    _ = attempt_import("cftime")

    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    components = {}

    for attr in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(attr, None)
        if text is None:
            continue
        if attr == "microsecond":
            # Right-pad the fractional digits to a full microsecond count.
            components[attr] = int(10 ** (6 - len(text)) * int(text))
        else:
            components[attr] = int(text)
        resolution = attr
    return default.replace(**components), resolution


def _parsed_string_to_bounds(date_type, resolution, parsed):
"""Generalization of
pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
Expand Down Expand Up @@ -436,7 +371,7 @@ def _partial_date_slice(self, resolution, parsed):

def _get_string_slice(self, key):
"""Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice"""
parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
parsed, resolution = _parse_iso8601(self.date_type, key)
try:
loc = self._partial_date_slice(resolution, parsed)
except KeyError as err:
Expand Down Expand Up @@ -483,7 +418,7 @@ def _maybe_cast_slice_bound(self, label, side):
if not isinstance(label, str):
return label

parsed, resolution = _parse_iso8601_with_reso(self.date_type, label)
parsed, resolution = _parse_iso8601(self.date_type, label)
start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed)
if self.is_monotonic_decreasing and len(self) > 1:
return end if side == "left" else start
Expand Down Expand Up @@ -811,11 +746,6 @@ def is_leap_year(self):
return func(self.year, calendar=self.calendar)


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` and return only the date, dropping the resolution."""
    parsed, _resolution = _parse_iso8601_with_reso(date_type, datetime_str)
    return parsed


def _parse_array_of_cftime_strings(strings, date_type):
"""Create a numpy array from an array of strings.

Expand All @@ -833,9 +763,9 @@ def _parse_array_of_cftime_strings(strings, date_type):
-------
np.array
"""
return np.array(
[_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
).reshape(strings.shape)
return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape(
strings.shape
)


def _contains_datetime_timedeltas(array):
Expand Down
70 changes: 70 additions & 0 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def named(name: str, pattern: str) -> str:
    """Return ``pattern`` wrapped in a regex named group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x: str) -> str:
    """Return ``x`` wrapped in an optional, non-capturing regex group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs: list[str]) -> str:
    """Chain the fragments in ``xs`` so every fragment after the first is optional.

    The result nests the optional groups: a later fragment can only match
    when all fragments before it matched. An empty ``xs`` gives ``""``.
    """
    if not xs:
        return ""
    nested = ""
    # Build from the innermost (last) fragment outwards.
    for fragment in reversed(xs):
        nested = fragment + optional(nested)
    return nested


def build_pattern(
    date_sep: str = r"\-",
    datetime_sep: str = r"T",
    time_sep: str = r"\:",
    micro_sep: str = r"\.",
) -> str:
    """Build an anchored regex for ISO-8601-like datetime strings.

    The year (optional sign, four or five digits) is mandatory. Month, day,
    hour, minute, second and microsecond follow in that order; each is
    optional but can only appear when every preceding field is present.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second
        and microsecond fields respectively. An empty string means the
        field follows without a separator.

    Returns
    -------
    str
        Pattern anchored with ``^`` and ``$`` containing one named group
        per field.
    """
    # NOTE: the microsecond separator default is an escaped, literal dot.
    # An unescaped "." would let any character precede the fraction.
    pieces = [
        (None, "year", r"[+-]?\d{4,5}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep or "") + named(name, sub_pattern) for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Date and time fields written without separators between them.
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# build_pattern's default separators.
_EXTENDED_PATTERN = build_pattern()
# Default separators, except a space between the date and the time part.
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
# parse_iso8601_like tries these in list order and uses the first match.
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string: str) -> dict[str, str | None]:
    """Match ``datetime_string`` against the known patterns.

    Returns the named groups of the first pattern in ``_PATTERNS`` that
    matches; groups for fields absent from the string are ``None``.

    Raises
    ------
    ValueError
        If none of the patterns matches.
    """
    attempts = (re.match(pattern, datetime_string) for pattern in _PATTERNS)
    match = next((m for m in attempts if m), None)
    if match is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return match.groupdict()


def _parse_iso8601(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)``. ``date`` is
    ``date_type(1, 1, 1)`` with every field found in ``timestr`` replaced.
    ``resolution`` is the name of the last field present, except that a
    fractional part of at most three digits is reported as
    ``"millisecond"`` rather than ``"microsecond"``.
    """
    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    components = {}

    for attr in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(attr, None)
        if text is None:
            continue
        resolution = attr
        if attr == "microsecond":
            if len(text) <= 3:
                resolution = "millisecond"
            # Right-pad the fractional digits to a full microsecond count.
            components[attr] = int(10 ** (6 - len(text)) * int(text))
        else:
            components[attr] = int(text)
    return default.replace(**components), resolution


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
# same as _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
Expand Down
35 changes: 35 additions & 0 deletions xarray/testing/strategies.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime
import warnings
from collections.abc import Hashable, Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, Protocol, overload

Expand Down Expand Up @@ -473,3 +475,36 @@ def unique_subset_of(
return (
{k: objs[k] for k in subset_keys} if isinstance(objs, Mapping) else subset_keys
)


class CFTimeStategy(st.SearchStrategy):
    """Strategy drawing a datetime between ``min_value`` and ``max_value``.

    The draw picks an integer number of microseconds within the span and
    adds it to ``min_value``, so both bounds are inclusive.
    """

    def __init__(self, min_value, max_value):
        # Run the base initializer so any state the base class sets up
        # there is present on this instance.
        super().__init__()
        self.min_value = min_value
        self.max_value = max_value

    def do_draw(self, data):
        """Draw one value by offsetting ``min_value`` by a random timedelta."""
        unit_microsecond = datetime.timedelta(microseconds=1)
        timespan_microseconds = (self.max_value - self.min_value) // unit_microsecond
        result = data.draw_integer(0, timespan_microseconds)
        with warnings.catch_warnings():
            # Silence warnings whose message mentions "date/calendar/year zero".
            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
            return self.min_value + datetime.timedelta(microseconds=result)


class CFTimeStrategyISO8601(st.SearchStrategy):
    """Strategy drawing cftime datetimes with years from -99999 to 99999.

    A calendar is sampled first; the value is then drawn from that
    calendar's date type via ``CFTimeStategy``.
    """

    # NOTE(review): a reviewer remarked that hypothesis' ``composite`` would
    # be the idiomatic way to write this strategy; left for a follow-up.

    def __init__(self):
        # Imported here rather than at module level (kept as in the original).
        from xarray.tests.test_coding_times import _all_cftime_date_types

        # Run the base initializer so any state the base class sets up
        # there is present on this instance.
        super().__init__()
        self.date_types = _all_cftime_date_types()
        self.calendars = list(self.date_types)

    def do_draw(self, data):
        """Sample a calendar, then draw a datetime of that calendar's date type."""
        calendar = data.draw(st.sampled_from(self.calendars))
        date_type = self.date_types[calendar]
        with warnings.catch_warnings():
            # Silence warnings whose message mentions "date/calendar/year zero".
            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
            # The last day of December is looked up on the date type itself.
            daysinmonth = date_type(99999, 12, 1).daysinmonth
            min_value = date_type(-99999, 1, 1)
            max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999)
            strategy = CFTimeStategy(min_value, max_value)
            return strategy.do_draw(data)
36 changes: 28 additions & 8 deletions xarray/tests/test_cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from xarray.coding.cftimeindex import (
CFTimeIndex,
_parse_array_of_cftime_strings,
_parse_iso8601_with_reso,
_parsed_string_to_bounds,
assert_all_valid_date_type,
)
from xarray.coding.times import (
_parse_iso8601,
parse_iso8601_like,
)
from xarray.tests import (
Expand Down Expand Up @@ -132,16 +134,34 @@ def date_dict(
list(ISO8601_LIKE_STRING_TESTS.values()),
ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
)
def test_parse_iso8601_like(string, expected):
result = parse_iso8601_like(string)
@pytest.mark.parametrize(
"five_digit_year", [False, True], ids=["four-digit-year", "five-digit-year"]
)
@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
def test_parse_iso8601_like(
five_digit_year: bool, sign: str, string: str, expected: dict
) -> None:
pre = "1" if five_digit_year else ""
datestring = sign + pre + string
result = parse_iso8601_like(datestring)
expected = expected.copy()
expected.update(year=sign + pre + expected["year"])
assert result == expected

if result["microsecond"] is None:
# check malformed single digit addendum
# this check is only performed when at least "hour" is given,
# like "1999010101", where a single added digit should raise;
# for "1999" (year), "199901" (month) and "19990101" (day)
# a single added digit would just be interpreted
# as having a 5-digit year.
if result["microsecond"] is None and result["hour"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + "3")
if result["second"] is None:
parse_iso8601_like(datestring + "3")

# check malformed floating point addendum
if result["second"] is None or result["microsecond"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + ".3")
parse_iso8601_like(datestring + ".3")


_CFTIME_CALENDARS = [
Expand Down Expand Up @@ -348,7 +368,7 @@ def test_cftimeindex_days_in_month_accessor(index):
def test_parse_iso8601_with_reso(date_type, string, date_args, reso):
expected_date = date_type(*date_args)
expected_reso = reso
result_date, result_reso = _parse_iso8601_with_reso(date_type, string)
result_date, result_reso = _parse_iso8601(date_type, string)
assert result_date == expected_date
assert result_reso == expected_reso

Expand Down
Loading