From 796c43c7fce797ed304d78ca18aedf3fbbae8b03 Mon Sep 17 00:00:00 2001 From: Casper van der Wel Date: Mon, 19 Aug 2024 14:44:50 +0200 Subject: [PATCH] Pandas 2.1 and py 3.12 compat (#123) --- .github/workflows/test-conda.yaml | 4 ++-- .github/workflows/test.yml | 10 ++++----- CHANGES.rst | 8 ++++--- dask_geomodeling/geometry/field_operations.py | 21 ++++++++++++++----- dask_geomodeling/raster/sources.py | 18 +++++++++------- dask_geomodeling/raster/temporal.py | 19 +++++++++-------- dask_geomodeling/tests/test_geometry_sinks.py | 1 + setup.py | 12 +++++------ 8 files changed, 56 insertions(+), 37 deletions(-) diff --git a/.github/workflows/test-conda.yaml b/.github/workflows/test-conda.yaml index 78995c9..6949018 100644 --- a/.github/workflows/test-conda.yaml +++ b/.github/workflows/test-conda.yaml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-12, windows-latest] - python: ["3.9", "3.10"] + python: ["3.9", "3.12"] steps: - uses: actions/checkout@v2 @@ -29,7 +29,7 @@ jobs: - name: Setup Environment shell: bash run: | - conda create --name test python=${{ matrix.python }} pytest numpy=1.* gdal=3.* scipy pytz dask-core toolz "pandas<2.2" geopandas=0.* "pyproj>=2" + conda create --name test python=${{ matrix.python }} pytest numpy=1.* gdal=3.* scipy pytz dask-core toolz "pandas<2.2" geopandas "pyproj>=2" fiona source activate test python -V conda info diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2576b3f..b0d6942 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,10 +15,6 @@ jobs: fail-fast: false matrix: include: - - os: ubuntu-20.04 - python: 3.7 - numpy: "==1.16.*" - pins: "pygdal==3.0.4.* scipy==1.3.* dask[delayed]==1.* pandas==0.25.* geopandas==0.7.*" - os: ubuntu-20.04 python: 3.8 numpy: "==1.18.*" @@ -38,7 +34,11 @@ jobs: - os: ubuntu-22.04 python: "3.11" numpy: "==1.*" - pins: "pygdal==3.4.1.* geopandas==0.* pandas==2.1.*" + pins: "pygdal==3.4.1.* scipy==1.11.* 
dask[delayed]==2023.* pandas==2.0.* geopandas==0.*" + - os: ubuntu-22.04 + python: "3.12" + numpy: "==1.*" + pins: "pygdal==3.4.1.*" steps: - uses: actions/checkout@v2 diff --git a/CHANGES.rst b/CHANGES.rst index 3dadb79..0554b65 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,10 +6,12 @@ Changelog of dask-geomodeling - Fixed warnings when reprojecting geometries with geopandas >= 0.9. -- Added mandatory `temporal` attribute for RasterBlock. +- Fixed compatibility and solved deprecation warnings with pandas 2.1. + Still incompatible with pandas >=2.2. + +- Added version constraint showing incompatibility with numpy 2. -- Added version constraints showing incompatibility with geopandas 1.*, - pandas 2.2, and numpy 2. +- Added mandatory `temporal` attribute for RasterBlock. 2.4.4 (2024-01-17) diff --git a/dask_geomodeling/geometry/field_operations.py b/dask_geomodeling/geometry/field_operations.py index f6899c9..8c1d4d0 100644 --- a/dask_geomodeling/geometry/field_operations.py +++ b/dask_geomodeling/geometry/field_operations.py @@ -32,6 +32,19 @@ ] +def _none_to_nan(series: pd.Series) -> pd.Series: + """Put NaN in place of None in a Series + + If the series has only nones, downcasts type to float.""" + if series.dtype == object: + nones = series.isna() + if nones.all(): # downcasts type to float: + return pd.Series(index=series.index, name=series.name, dtype=float) + else: + series=series.copy() + series[nones] = np.nan + return series + class Classify(BaseSingleSeries): """ Classify a value column into different bins @@ -106,12 +119,11 @@ def right(self): return self.args[3] @staticmethod - def process(series, bins, labels, right): + def process(series: pd.Series, bins, labels, right): open_bounds = len(labels) == len(bins) + 1 if open_bounds: bins = np.concatenate([[-np.inf], bins, [np.inf]]) - if series.dtype == object: - series = series.fillna(value=np.nan) + series = _none_to_nan(series) result = pd.cut(series, bins, right, labels) # Transform from categorical 
to whatever suits the "labels". The @@ -207,8 +219,7 @@ def process(data, value_column, bin_columns, labels, right): return pd.Series([], dtype=float) features = data["features"] series = features[value_column] - if series.dtype == object: - series = series.fillna(value=np.nan) + series = _none_to_nan(series) values = series.values bins = features[bin_columns].values n_bins = len(bin_columns) diff --git a/dask_geomodeling/raster/sources.py b/dask_geomodeling/raster/sources.py index 225287e..a2ae171 100644 --- a/dask_geomodeling/raster/sources.py +++ b/dask_geomodeling/raster/sources.py @@ -5,7 +5,7 @@ from osgeo import gdal, gdal_array -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from dask_geomodeling import utils @@ -14,6 +14,10 @@ __all__ = ["MemorySource", "RasterFileSource"] +def utc_from_ms_timestamp(timestamp): + """Returns naive UTC datetime from ms timestamp""" + return datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc).replace(tzinfo=None) + class MemorySource(RasterBlock): """A raster source that interfaces data from memory. 
@@ -174,9 +178,9 @@ def period(self): if len(self) == 0: return elif len(self) == 1: - return (datetime.utcfromtimestamp(self.time_first / 1000),) * 2 + return (utc_from_ms_timestamp(self.time_first),) * 2 else: - first = datetime.utcfromtimestamp(self.time_first / 1000) + first = utc_from_ms_timestamp(self.time_first) last = first + (len(self) - 1) * self.timedelta return first, last @@ -206,7 +210,7 @@ def get_sources_and_requests(self, **request): start, stop, first_i, last_i = utils.snap_start_stop( request.get("start"), request.get("stop"), - datetime.utcfromtimestamp(self.time_first / 1000), + utc_from_ms_timestamp(self.time_first), self.timedelta, len(self), ) @@ -398,9 +402,9 @@ def period(self): if len(self) == 0: return elif len(self) == 1: - return (datetime.utcfromtimestamp(self.time_first / 1000),) * 2 + return (utc_from_ms_timestamp(self.time_first),) * 2 else: - first = datetime.utcfromtimestamp(self.time_first / 1000) + first = utc_from_ms_timestamp(self.time_first) last = first + (len(self) - 1) * self.timedelta return first, last @@ -427,7 +431,7 @@ def get_sources_and_requests(self, **request): start, stop, first_i, last_i = utils.snap_start_stop( request.get("start"), request.get("stop"), - datetime.utcfromtimestamp(self.time_first / 1000), + utc_from_ms_timestamp(self.time_first), self.timedelta, len(self), ) diff --git a/dask_geomodeling/raster/temporal.py b/dask_geomodeling/raster/temporal.py index abbfb93..0b267a5 100644 --- a/dask_geomodeling/raster/temporal.py +++ b/dask_geomodeling/raster/temporal.py @@ -271,20 +271,20 @@ def _get_bin_label(dt, frequency, closed, label, timezone): # while there is only 1 sample here, there might be multiple (empty) bins # in some cases (see test_issue_5917) series = pd.Series([0], index=[_dt_to_ts(dt, timezone)]) - for label, bin in series.resample(frequency, closed=closed, label=label, kind="timestamp"): + for label, bin in series.resample(frequency, closed=closed, label=label): if len(bin) != 0: break 
return _ts_to_dt(label, timezone) -def _get_bin_period(dt, frequency, closed, label, timezone): - """Returns the label of the bin the input dt belongs to. +def _get_bin_start(dt, frequency, closed, label, timezone): + """Returns the start (left side) of the bin the input dt belongs to. :type dt: datetime.datetime without timezone. """ # go through resample, this is the only function that supports 'closed' series = pd.Series([0], index=[_dt_to_ts(dt, timezone)]) - resampled = series.resample(frequency, closed=closed, label=label, kind="period") + resampled = series.resample(frequency, closed=closed, label="left") return resampled.first().index[0] @@ -488,6 +488,7 @@ def __init__( if not isinstance(frequency, str): raise TypeError("'{}' object is not allowed.".format(type(frequency))) frequency = to_offset(frequency).freqstr + if closed not in {None, "left", "right"}: raise ValueError("closed must be None, 'left', or 'right'.") if label not in {None, "left", "right"}: @@ -552,9 +553,11 @@ def period(self): @property def timedelta(self): + if self.frequency is None: + return None try: - return to_offset(self.frequency).delta - except AttributeError: + return pd.Timedelta(to_offset(self.frequency)).to_pytimedelta() + except ValueError: return # e.g. 
Month is non-equidistant @property @@ -830,10 +833,8 @@ def get_sources_and_requests(self, **request): request["start"] = self.period[0] request["stop"] = stop else: - start_period = _get_bin_period(start, **kwargs) - # snap request 'start' to the start of the first period - request["start"] = _ts_to_dt(start_period.start_time, self.timezone) + request["start"] = _ts_to_dt(_get_bin_start(start, **kwargs), self.timezone) # snap request 'stop' to the last requested time request["stop"] = stop if kwargs["closed"] != "left": diff --git a/dask_geomodeling/tests/test_geometry_sinks.py b/dask_geomodeling/tests/test_geometry_sinks.py index 7398d76..a1d9deb 100644 --- a/dask_geomodeling/tests/test_geometry_sinks.py +++ b/dask_geomodeling/tests/test_geometry_sinks.py @@ -26,6 +26,7 @@ def assert_frame_equal_ignore_index(actual, expected, sort_col): actual.set_index(sort_col).sort_index(), expected.set_index(sort_col).sort_index(), check_like=True, + check_index_type=False ) diff --git a/setup.py b/setup.py index cf3d02c..c7a6175 100644 --- a/setup.py +++ b/setup.py @@ -7,12 +7,12 @@ install_requires = ( [ - "dask[delayed]>=0.20", - "pandas>=0.23,<=2.2", - "geopandas>=0.7,<1", + "dask[delayed]>=2.9", + "pandas>=1.0,<2.2", + "geopandas>=0.8", "pytz", - "numpy>=1.14,<2", - "scipy>=1.1", + "numpy>=1.18,<2", + "scipy>=1.4", "fiona" ], ) @@ -53,7 +53,7 @@ zip_safe=False, install_requires=install_requires, tests_require=tests_require, - python_requires='>=3.7', + python_requires='>=3.8', extras_require={"test": tests_require, "cityhash": ["cityhash"]}, entry_points={"console_scripts": []}, )