From d2270753dc5dc75b1d7c7bc38647cf51ba937147 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 26 Apr 2022 09:07:11 -0600 Subject: [PATCH 01/44] Handle numeric_only in xarray --- flox/xarray.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/flox/xarray.py b/flox/xarray.py index e115b7f9d..87fbd78d7 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -223,11 +223,6 @@ def xarray_reduce( else: dim = tuple() - # TODO: do this for specific reductions only - bad_dtypes = tuple( - k for k in ds.variables if k not in ds.dims and ds[k].dtype.kind in ("S", "U") - ) - # broadcast all variables against each other along all dimensions in `by` variables # don't exclude `dim` because it need not be a dimension in any of the `by` variables! # in the case where dim is Ellipsis, and by.ndim < obj.ndim @@ -303,8 +298,6 @@ def wrapper(array, *by, func, skipna, **kwargs): if isinstance(obj, xr.Dataset): # broadcasting means the group dim gets added to ds, so we check the original obj for k, v in obj.data_vars.items(): - if k in bad_dtypes: - continue is_missing_dim = not (any(d in v.dims for d in dim)) if is_missing_dim: missing_dim[k] = v @@ -314,7 +307,7 @@ def wrapper(array, *by, func, skipna, **kwargs): actual = xr.apply_ufunc( wrapper, - ds.drop_vars(tuple(missing_dim) + bad_dtypes).transpose(..., *grouper_dims), + ds.drop_vars(tuple(missing_dim)).transpose(..., *grouper_dims), *by, input_core_dims=input_core_dims, # for xarray's test_groupby_duplicate_coordinate_labels From 14796bf6db07170c4979371616d8b235c7dabe46 Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 27 Apr 2022 08:36:15 -0600 Subject: [PATCH 02/44] Add dependabot --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..8ac6b8c49 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" From 9dcdd7899d2cd18408ef0843cd48b8edd66055f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Apr 2022 14:36:52 +0000 Subject: [PATCH 03/44] Bump actions/checkout from 2 to 3 Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/ci-additional.yaml | 4 ++-- .github/workflows/ci.yaml | 4 ++-- .github/workflows/pypi.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 7224f79aa..9c1b31b2b 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -20,7 +20,7 @@ jobs: outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 2 - uses: xarray-contrib/ci-trigger@v1.1 @@ -47,7 +47,7 @@ jobs: "minimal-requirements", ] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 # Fetch all history for all branches and tags. 
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f45961c9d..8cd0853fb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,7 +23,7 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.7", "3.8", "3.9"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set environment variables @@ -76,7 +76,7 @@ jobs: # runs-on: ubuntu-latest # steps: # - name: Checkout - # uses: actions/checkout@v2 + # uses: actions/checkout@v3 # - name: set up Python 3.8 # uses: actions/setup-python@v2 # with: diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 492d49649..bcfe26495 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -8,7 +8,7 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v2 with: From d944447525b53b879bccaf16e15f6587f58d9426 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Apr 2022 14:36:50 +0000 Subject: [PATCH 04/44] Bump actions/setup-python from 2 to 3 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 3. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index bcfe26495..db4ffd715 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -10,7 +10,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: '3.x' - name: Install dependencies From e1d253333e3842c33f7a48b61bcacfcb4c7cb2dc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Apr 2022 08:43:02 -0600 Subject: [PATCH 05/44] Bump codecov/codecov-action from 2.1.0 to 3.1.0 (#86) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 2 +- .github/workflows/ci.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 9c1b31b2b..6a2b5ab40 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -96,7 +96,7 @@ jobs: --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v2.1.0 + uses: codecov/codecov-action@v3.1.0 with: file: ./coverage.xml flags: unittests,${{ matrix.env }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8cd0853fb..e448f31a5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -63,7 +63,7 @@ jobs: pytest -n auto --cov=./ --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v3.1.0 with: file: ./coverage.xml flags: unittests From 69182739f808b562f5bfc82739d7c95eb46ce2b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Apr 2022 08:43:38 -0600 
Subject: [PATCH 06/44] Bump actions/cache from 2 to 3 (#87) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 2 +- .github/workflows/ci.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 6a2b5ab40..e3ea111a3 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -56,7 +56,7 @@ jobs: echo "CONDA_ENV_FILE=ci/${{ matrix.env }}.yml" >> $GITHUB_ENV - name: Cache conda - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/conda_pkgs_dir key: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e448f31a5..33d98393e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,7 @@ jobs: echo "CONDA_ENV_FILE=ci/environment.yml" >> $GITHUB_ENV echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - name: Cache conda - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/conda_pkgs_dir key: From 5798990c446ffa6093d024b5fd8376aa73c83c2b Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 27 Apr 2022 08:44:32 -0600 Subject: [PATCH 07/44] Use GH concurrency setting --- .github/workflows/cancel-duplicate-runs.yaml | 14 -------------- .github/workflows/ci.yaml | 4 ++++ 2 files changed, 4 insertions(+), 14 deletions(-) delete mode 100644 .github/workflows/cancel-duplicate-runs.yaml diff --git a/.github/workflows/cancel-duplicate-runs.yaml b/.github/workflows/cancel-duplicate-runs.yaml deleted file mode 100644 index 8463d9590..000000000 --- a/.github/workflows/cancel-duplicate-runs.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: Cancel -on: - workflow_run: - workflows: ["CI"] - types: - - requested -jobs: - cancel: - name: Cancel previous runs - runs-on: ubuntu-latest - steps: - - uses: styfle/cancel-workflow-action@0.8.0 - with: - workflow_id: ${{ github.event.workflow.id }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 33d98393e..93618505b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,6 +10,10 @@ on: - cron: "0 0 * * *" # Daily “At 00:00” workflow_dispatch: # allows you to trigger manually +concurrency: + group: ${{ github.ref }} + cancel-in-progress: true + jobs: build: name: Build (${{ matrix.python-version }}, ${{ matrix.os }}) From cb640f9030463b3e39c6a0b232fbf0ce18dc4260 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 29 Apr 2022 22:54:25 -0600 Subject: [PATCH 08/44] Fixes for latest numpy_groupies (#85) * Remove nanvar, nanstd compatibility code * Remove argreduction compatibility code * Fix argreduction along axis with single chunk Fix argreductions * Update min numpy_groupies version * Restore tests * Fix numpy tests. 
* Skip nanarg* instead of xfail
---
 ci/environment.yml          |  2 +-
 ci/minimal-requirements.yml |  2 +-
 ci/no-dask.yml              |  2 +-
 ci/no-xarray.yml            |  2 +-
 flox/aggregations.py        | 14 +++++-
 flox/core.py                | 97 ++++++++++++++++++-------------------
 setup.cfg                   |  2 +-
 tests/test_core.py          |  6 +--
 8 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index cfb44d257..bbaf5ded6 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -13,7 +13,7 @@ dependencies:
   - pytest-xdist
   - xarray
   - pre-commit
-  - numpy_groupies
+  - numpy_groupies>=0.9.15
   - pooch
   - toolz
   - numba
diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml
index 18975f625..81b483e74 100644
--- a/ci/minimal-requirements.yml
+++ b/ci/minimal-requirements.yml
@@ -8,7 +8,7 @@ dependencies:
   - pytest
   - pytest-cov
   - pytest-xdist
-  - numpy_groupies
+  - numpy_groupies>=0.9.15
   - pandas
   - pooch
   - toolz
diff --git a/ci/no-dask.yml b/ci/no-dask.yml
index 3a8bcec5b..698297918 100644
--- a/ci/no-dask.yml
+++ b/ci/no-dask.yml
@@ -12,7 +12,7 @@ dependencies:
   - xarray
   - numpydoc
   - pre-commit
-  - numpy_groupies
+  - numpy_groupies>=0.9.15
   - pooch
   - toolz
   - numba
diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml
index c9affa643..6e54d8f4b 100644
--- a/ci/no-xarray.yml
+++ b/ci/no-xarray.yml
@@ -12,7 +12,7 @@ dependencies:
   - dask-core
   - numpydoc
   - pre-commit
-  - numpy_groupies
+  - numpy_groupies>=0.9.15
   - pooch
   - toolz
   - numba
diff --git a/flox/aggregations.py b/flox/aggregations.py
index 280e36613..c97c97477 100644
--- a/flox/aggregations.py
+++ b/flox/aggregations.py
@@ -9,6 +9,14 @@ from . import aggregate_flox, aggregate_npg, xrdtypes as dtypes, xrutils
 
 
+def _is_arg_reduction(func: str | Aggregation) -> bool:
+    if isinstance(func, str) and func in ["argmin", "argmax", "nanargmax", "nanargmin"]:
+        return True
+    if isinstance(func, Aggregation) and func.reduction_type == "argreduce":
+        return True
+    return False
+
+
 def generic_aggregate(
     group_idx,
     array,
@@ -488,7 +496,11 @@ def _initialize_aggregation(
         agg.fill_value[func] = _get_fill_value(agg.dtype[func], agg.fill_value[func])
 
     fv = fill_value if fill_value is not None else agg.fill_value[agg.name]
-    agg.fill_value["numpy"] = (fv,)
+    if _is_arg_reduction(agg):
+        # this allows us to unravel_index easily. we have to do that nearly every time.
+        agg.fill_value["numpy"] = (0,)
+    else:
+        agg.fill_value["numpy"] = (fv,)
 
     if finalize_kwargs is not None:
         assert isinstance(finalize_kwargs, dict)
diff --git a/flox/core.py b/flox/core.py
index f4ecc031d..f0a9598ab 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -509,6 +509,7 @@ def chunk_argreduce(
     dask.array.reductions.argtopk
     """
     array, idx = array_plus_idx
+    by = np.broadcast_to(by, array.shape)
 
     results = chunk_reduce(
         array,
@@ -522,10 +523,13 @@ def chunk_argreduce(
         sort=sort,
     )
     if not isnull(results["groups"]).all():
-        # will not work for empty groups...
- # glorious idx = np.broadcast_to(idx, array.shape) + + # array, by get flattened to 1D before passing to npg + # so the indexes need to be unraveled newidx = np.unravel_index(results["intermediates"][1], array.shape) + + # Now index into the actual "global" indexes `idx` results["intermediates"][1] = idx[newidx] if reindex and expected_groups is not None: @@ -533,6 +537,8 @@ def chunk_argreduce( results["intermediates"][1], results["groups"].squeeze(), expected_groups, fill_value=0 ) + assert results["intermediates"][0].shape == results["intermediates"][1].shape + return results @@ -879,34 +885,45 @@ def _grouped_combine( array_idx = tuple( _conc2(x_chunk, key1="intermediates", key2=idx, axis=axis) for idx in (0, 1) ) - results = chunk_argreduce( - array_idx, - groups, - func=agg.combine[slicer], # count gets treated specially next - axis=axis, - expected_groups=None, - fill_value=agg.fill_value["intermediate"][slicer], - dtype=agg.dtype["intermediate"][slicer], - engine=engine, - sort=sort, - ) + + # for a single element along axis, we don't want to run the argreduction twice + # This happens when we are reducing along an axis with a single chunk. + avoid_reduction = array_idx[0].shape[axis[0]] == 1 + if avoid_reduction: + results = {"groups": groups, "intermediates": list(array_idx)} + else: + results = chunk_argreduce( + array_idx, + groups, + func=agg.combine[slicer], # count gets treated specially next + axis=axis, + expected_groups=None, + fill_value=agg.fill_value["intermediate"][slicer], + dtype=agg.dtype["intermediate"][slicer], + engine=engine, + sort=sort, + ) if agg.chunk[-1] == "nanlen": counts = _conc2(x_chunk, key1="intermediates", key2=2, axis=axis) - # sum the counts - results["intermediates"].append( - chunk_reduce( - counts, - groups, - func="sum", - axis=axis, - expected_groups=None, - fill_value=(0,), - dtype=(np.intp,), - engine=engine, - sort=sort, - )["intermediates"][0] - ) + + if avoid_reduction: + results["intermediates"].append(counts) + else: + # sum the counts + results["intermediates"].append( + chunk_reduce( + counts, + groups, + func="sum", + axis=axis, + expected_groups=None, + fill_value=(0,), + dtype=(np.intp,), + engine=engine, + sort=sort, + )["intermediates"][0] + ) elif agg.reduction_type == "reduce": # Here we reduce the intermediates individually @@ -1006,24 +1023,7 @@ def _reduce_blockwise(array, by, agg, *, axis, expected_groups, fill_value, engi ) # type: ignore if _is_arg_reduction(agg): - if array.ndim > 1: - # default fill_value is -1; we can't unravel that; - # so replace -1 with 0; unravel; then replace 0 with -1 - # UGH! 
- idx = results["intermediates"][0] - mask = idx == agg.fill_value["numpy"][0] - idx[mask] = 0 - # Fix npg bug where argmax with nD array, 1D group_idx, axis=-1 - # will return wrong indices - idx = np.unravel_index(idx, array.shape)[-1] - idx[mask] = agg.fill_value["numpy"][0] - results["intermediates"][0] = idx - elif agg.name in ["nanvar", "nanstd"]: - # TODO: Fix npg bug where all-NaN rows are 0 instead of NaN - value, counts = results["intermediates"] - mask = counts <= 0 - value[mask] = np.nan - results["intermediates"][0] = value + results["intermediates"][0] = np.unravel_index(results["intermediates"][0], array.shape)[-1] result = _finalize_results( results, agg, axis, expected_groups, fill_value=fill_value, reindex=reindex @@ -1530,12 +1530,7 @@ def groupby_reduce( # The only way to do this consistently is mask out using min_count # Consider np.sum([np.nan]) = np.nan, np.nansum([np.nan]) = 0 if min_count is None: - if ( - len(axis) < by.ndim - or fill_value is not None - # TODO: Fix npg bug where all-NaN rows are 0 instead of NaN - or (not has_dask and isinstance(func, str) and func in ["nanvar", "nanstd"]) - ): + if len(axis) < by.ndim or fill_value is not None: min_count = 1 # TODO: set in xarray? diff --git a/setup.cfg b/setup.cfg index b7d4f5ce3..03a25bd82 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,7 +28,7 @@ include_package_data = True python_requires = >=3.7 install_requires = pandas - numpy_groupies + numpy_groupies >= '0.9.15' toolz importlib-metadata; python_version < '3.8' diff --git a/tests/test_core.py b/tests/test_core.py index f282432e8..d643140ea 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -53,9 +53,9 @@ def dask_array_ones(*args): "min", "nanmin", "argmax", - pytest.param("nanargmax", marks=(pytest.mark.xfail,)), + pytest.param("nanargmax", marks=(pytest.mark.skip,)), "argmin", - pytest.param("nanargmin", marks=(pytest.mark.xfail,)), + pytest.param("nanargmin", marks=(pytest.mark.skip,)), "any", "all", pytest.param("median", marks=(pytest.mark.skip,)), @@ -142,7 +142,7 @@ def gen_array_by(size, func): return array, by -@pytest.mark.parametrize("chunks", [None, 3, 4]) +@pytest.mark.parametrize("chunks", [None, -1, 3, 4]) @pytest.mark.parametrize("nby", [1, 2, 3]) @pytest.mark.parametrize("size", ((12,), (12, 9))) @pytest.mark.parametrize("add_nan_by", [True, False]) From f8da87cf98c9f67ccbfcabe629dc32971a7bfb45 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 29 Apr 2022 23:04:15 -0600 Subject: [PATCH 09/44] Drop py37 (#64) --- .github/workflows/ci-additional.yaml | 2 +- .github/workflows/ci.yaml | 27 +-------------------------- setup.cfg | 4 +--- 3 files changed, 3 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index e3ea111a3..a4dc9024b 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -70,7 +70,7 @@ jobs: mamba-version: "*" activate-environment: flox-tests auto-update-conda: false - python-version: 3.8 + python-version: 3.9 use-only-tar-bz2: true - name: Install conda dependencies diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 93618505b..804563c2f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,7 +25,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest"] - python-version: ["3.7", "3.8", "3.9"] + python-version: ["3.8", "3.10"] steps: - uses: actions/checkout@v3 with: @@ -74,28 +74,3 @@ jobs: env_vars: RUNNER_OS,PYTHON_VERSION name: codecov-umbrella 
fail_ci_if_error: false - - # docs-build: - # name: Documentation build - # runs-on: ubuntu-latest - # steps: - # - name: Checkout - # uses: actions/checkout@v3 - # - name: set up Python 3.8 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.8 - - # - name: Install package - # run: | - # python -m pip install --upgrade pip - # python -m pip install -e . - - # - name: Install documentation dependencies - # run: | - # python -m pip install -r docs/requirements-docs.txt - - # - name: Build docs - # run: | - # cd docs - # make html diff --git a/setup.cfg b/setup.cfg index 03a25bd82..72790e8a6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,7 +15,6 @@ classifiers = Intended Audience :: Science/Research Programming Language :: Python Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -25,12 +24,11 @@ classifiers = packages = find: zip_safe = False # https://mypy.readthedocs.io/en/latest/installed_packages.html include_package_data = True -python_requires = >=3.7 +python_requires = >=3.8 install_requires = pandas numpy_groupies >= '0.9.15' toolz - importlib-metadata; python_version < '3.8' [options.extras_require] all = From af70b2af41061194ad3eef8a511348abb2307b0b Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 9 May 2022 23:08:19 +0200 Subject: [PATCH 10/44] Create py.typed (#92) --- flox/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 flox/py.typed diff --git a/flox/py.typed b/flox/py.typed new file mode 100644 index 000000000..e69de29bb From 2fdf98ae92575e5b1ffa4b789426857d26a1523b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 May 2022 21:12:17 -0600 Subject: [PATCH 11/44] Fix binning with "cohorts" & "split-reduce" (#94) --- flox/core.py | 21 +++++++++++++++------ tests/test_core.py | 7 ++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/flox/core.py b/flox/core.py index f0a9598ab..a4aefe357 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1459,6 +1459,10 @@ def groupby_reduce( by: tuple = tuple(np.asarray(b) if not is_duck_array(b) else b for b in by) nby = len(by) by_is_dask = any(is_duck_dask_array(b) for b in by) + + if method in ["split-reduce", "cohorts"] and by_is_dask: + raise ValueError(f"method={method!r} can only be used when grouping by numpy arrays.") + if not is_duck_array(array): array = np.asarray(array) if isinstance(isbin, bool): @@ -1477,9 +1481,11 @@ def groupby_reduce( # (pd.IntervalIndex or not) expected_groups = _convert_expected_groups_to_index(expected_groups, isbin, sort) - # when grouping by multiple variables, we factorize early. # TODO: could restrict this to dask-only - if nby > 1: + factorize_early = (nby > 1) or ( + any(isbin) and method in ["split-reduce", "cohorts"] and is_duck_dask_array(array) + ) + if factorize_early: by, final_groups, grp_shape = _factorize_multiple( by, expected_groups, by_is_dask=by_is_dask ) @@ -1497,6 +1503,7 @@ def groupby_reduce( if method in ["blockwise", "cohorts", "split-reduce"] and len(axis) != by.ndim: raise NotImplementedError( "Must reduce along all dimensions of `by` when method != 'map-reduce'." 
+ f"Received method={method!r}" ) # TODO: make sure expected_groups is unique @@ -1617,10 +1624,12 @@ def groupby_reduce( result = result[..., sorted_idx] groups = (groups[0][sorted_idx],) - if nby > 1: + if factorize_early: # nan group labels are factorized to -1, and preserved - # now we get rid of them - nanmask = groups[0] == -1 + # now we get rid of them by reindexing + # This also handles bins with no data + result = reindex_( + result, from_=groups[0], to=expected_groups, fill_value=fill_value + ).reshape(result.shape[:-1] + grp_shape) groups = final_groups - result = result[..., ~nanmask].reshape(result.shape[:-1] + grp_shape) return (result, *groups) diff --git a/tests/test_core.py b/tests/test_core.py index d643140ea..03dbfbae3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -582,12 +582,16 @@ def test_npg_nanarg_bug(func): assert_equal(actual, expected) +@pytest.mark.parametrize("method", ["split-reduce", "cohorts", "map-reduce"]) @pytest.mark.parametrize("chunk_labels", [False, True]) @pytest.mark.parametrize("chunks", ((), (1,), (2,))) -def test_groupby_bins(chunk_labels, chunks, engine) -> None: +def test_groupby_bins(chunk_labels, chunks, engine, method) -> None: array = [1, 1, 1, 1, 1, 1] labels = [0.2, 1.5, 1.9, 2, 3, 20] + if method in ["split-reduce", "cohorts"] and chunk_labels: + pytest.xfail() + if chunks: if not has_dask: pytest.skip() @@ -604,6 +608,7 @@ def test_groupby_bins(chunk_labels, chunks, engine) -> None: isbin=True, fill_value=0, engine=engine, + method=method, ) expected = np.array([3, 1, 0]) for left, right in zip(groups, pd.IntervalIndex.from_arrays([1, 2, 4], [2, 4, 5]).to_numpy()): From c34559949245278cfb312951ffd6916efac12a8d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 May 2022 21:54:07 -0600 Subject: [PATCH 12/44] Add upstream-dev test (#93) --- .github/workflows/ci.yaml | 23 +++++++++++++++++++++++ ci/upstream-dev-env.yml | 19 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 ci/upstream-dev-env.yml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 804563c2f..8813e4da4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -74,3 +74,26 @@ jobs: env_vars: RUNNER_OS,PYTHON_VERSION name: codecov-umbrella fail_ci_if_error: false + + upstream-dev: + name: upstream-dev + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: conda-incubator/setup-miniconda@v2 + with: + channels: conda-forge + mamba-version: "*" + activate-environment: flox-tests + auto-update-conda: false + python-version: '3.10' + - name: Set up conda environment + shell: bash -l {0} + run: | + mamba env update -f ci/upstream-dev-env.yml + python -m pip install -e . 
+ conda list + - name: Run Tests + shell: bash -l {0} + run: | + pytest -n 2 diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml new file mode 100644 index 000000000..b44dc45e7 --- /dev/null +++ b/ci/upstream-dev-env.yml @@ -0,0 +1,19 @@ +name: flox-tests +channels: + - conda-forge +dependencies: + - cachey + - codecov + - netcdf4 + - pooch + - toolz + - numba + - pytest + - pytest-cov + - pytest-xdist + - pip + - pip: + - git+https://github.com/pydata/xarray + - git+https://github.com/pandas-dev/pandas + - git+https://github.com/dask/dask + - git+https://github.com/ml31415/numpy-groupies From 592c46ba0bb859f732968b68426b6332caebc213 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 May 2022 12:46:04 -0600 Subject: [PATCH 13/44] Add hourly climatology story (#97) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/user-stories.md | 1 + .../user-stories/climatology-hourly.ipynb | 1976 +++++++++++++++++ .../user-stories/hourly-climatology.html | 88 + 3 files changed, 2065 insertions(+) create mode 100644 docs/source/user-stories/climatology-hourly.ipynb create mode 100644 docs/source/user-stories/hourly-climatology.html diff --git a/docs/source/user-stories.md b/docs/source/user-stories.md index 5b2148535..39ff7cd0a 100644 --- a/docs/source/user-stories.md +++ b/docs/source/user-stories.md @@ -5,5 +5,6 @@ :maxdepth: 1 user-stories/climatology.ipynb + user-stories/climatology-hourly.ipynb user-stories/custom-aggregations.ipynb ``` diff --git a/docs/source/user-stories/climatology-hourly.ipynb b/docs/source/user-stories/climatology-hourly.ipynb new file mode 100644 index 000000000..de145fab3 --- /dev/null +++ b/docs/source/user-stories/climatology-hourly.ipynb @@ -0,0 +1,1976 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "84e8bbee-90cc-4e6a-bf89-c56dc19c11ca", + "metadata": {}, + "source": [ + "# More climatology reductions\n", + "\n", + "This one is motivated by\n", + "[this Pangeo Discourse post](https://discourse.pangeo.io/t/dask-xarray-and-swap-memory-polution-on-local-linux-cluster/2453/5)\n", + "and follows\n", + "[this notebook](https://nbviewer.ipython.org/gist/fmaussion/95d1b9c9a3113db2f987b91e842cb8e0)\n", + "\n", + "The task is to compute an hourly climatology from an hourly dataset with 744\n", + "hours in each chunk.\n", + "\n", + "We choose the \"map-reduce\" strategy because:\n", + "\n", + "1. all hours (groups) are present in each chunk;\n", + "2. a groupby reduction applied blockwise will result in arrays of shape (X,\n", + " Y, 744) being reduced to (X, Y, 24) i.e. 744/24=31x decrease in chunk size,\n", + " so this should work well memory wise.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "727f490e-906a-4537-ac5e-3c67985cd6d5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dcherian/mambaforge/envs/dcpy/lib/python3.8/site-packages/distributed/node.py:180: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 51613 instead\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fc1dd8438def4d75acee8602c544248c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(HTML(value='
+       <LocalCluster dashboard widget; HTML repr elided>')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from distributed import Client\n",
+    "\n",
+    "client = Client()  # starts the LocalCluster shown above\n",
+    "client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dask.array\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import xarray as xr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<xarray.Dataset>\n",
+       "Dimensions:  (time: 8760, latitude: 721, longitude: 1440)\n",
+       "Coordinates:\n",
+       "  * time     (time) datetime64[ns] 2021-01-01 ... 2021-12-31T23:00:00\n",
+       "Dimensions without coordinates: latitude, longitude\n",
+       "Data variables:\n",
+       "    tp       (time, latitude, longitude) float32 dask.array<chunksize=(744, 50, 1440), meta=np.ndarray>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds = xr.Dataset(\n",
+    "    {\n",
+    "        \"tp\": (\n",
+    "            (\"time\", \"latitude\", \"longitude\"),\n",
+    "            dask.array.ones(\n",
+    "                (8760, 721, 1440), chunks=(744, 50, 1440), dtype=np.float32\n",
+    "            ),\n",
+    "        )\n",
+    "    },\n",
+    "    coords={\"time\": pd.date_range(\"2021-01-01\", \"2021-12-31 23:59\", freq=\"H\")},\n",
+    ")\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a350782-b747-4e5e-8b8b-15fab72c0d2c",
+   "metadata": {},
+   "source": [
+    "Here's just plain xarray: 10000 tasks and one chunk per hour in the output\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ecc77698-5879-4b7c-ad97-891fb104d295",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<xarray.DataArray 'tp' (hour: 24, latitude: 721, longitude: 1440)>\n",
+       "dask.array<stack, shape=(24, 721, 1440), dtype=float32, chunksize=(1, 50, 1440), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * hour     (hour) int64 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23\n",
+       "Dimensions without coordinates: latitude, longitude"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds.tp.groupby(\"time.hour\").mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "beccd9f8-ad62-4cd8-86cc-acfe14f13023",
+   "metadata": {},
+   "source": [
+    "And flox: 600 tasks and all hours in a single chunk\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0a3da8e5-863a-4602-9176-0a9adc689563",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<xarray.DataArray 'tp' (hour: 24, latitude: 721, longitude: 1440)>\n",
+       "dask.array<transpose, shape=(24, 721, 1440), dtype=float32, chunksize=(24, 50, 1440), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * hour     (hour) int64 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23\n",
+       "Dimensions without coordinates: latitude, longitude"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import flox.xarray\n",
+    "\n",
+    "hourly = flox.xarray.xarray_reduce(ds.tp, ds.time.dt.hour, func=\"mean\")\n",
+    "hourly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8aa1a641-1ce1-4264-96dc-d11bb1d4ab57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from distributed import performance_report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e37c5aa2-c77a-4d87-8db4-5052c675c42d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with performance_report(\"hourly-climatology.html\"):\n",
+    "    hourly.compute()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/user-stories/hourly-climatology.html b/docs/source/user-stories/hourly-climatology.html
new file mode 100644
index 000000000..fc78db18d
--- /dev/null
+++ b/docs/source/user-stories/hourly-climatology.html
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html lang="en">
+<!-- Static bokeh page titled "Dask Performance Report"; embedded HTML/JS payload elided -->
+</html>
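An aside on the task counts quoted in the notebook above (10000 for plain xarray vs 600 for flox): they are easy to check directly. A minimal sketch, assuming the notebook's `ds` and `hourly` objects are in scope; `__dask_graph__()` is the generic dask-collection protocol, and the length of the returned graph approximates the task count:

    # Compare graph sizes: plain xarray groupby vs the flox version
    plain = ds.tp.groupby("time.hour").mean()

    print(len(plain.__dask_graph__()))   # ~10000 tasks for plain xarray
    print(len(hourly.__dask_graph__()))  # ~600 tasks for flox's map-reduce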
From 19543801fb28f2557591aecd559e52b66c8051b3 Mon Sep 17 00:00:00 2001
From: dcherian
Date: Wed, 11 May 2022 12:50:47 -0600
Subject: [PATCH 14/44] Link to perf report.

---
 docs/source/user-stories/climatology-hourly.ipynb | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/user-stories/climatology-hourly.ipynb b/docs/source/user-stories/climatology-hourly.ipynb
index de145fab3..6342355b8 100644
--- a/docs/source/user-stories/climatology-hourly.ipynb
+++ b/docs/source/user-stories/climatology-hourly.ipynb
@@ -1695,6 +1695,14 @@
     "with performance_report(\"hourly-climatology.html\"):\n",
     "    hourly.compute()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "494766c2-305a-4518-b2b7-a85bcc7fd5b2",
+   "metadata": {},
+   "source": [
+    "View the performance report [here](https://rawcdn.githack.com/dcherian/flox/592c46ba0bb859f732968b68426b6332caebc213/docs/source/user-stories/hourly-climatology.html)"
+   ]
   }
  ],
  "metadata": {

From 1803f77b372a41c6838b129003cdfdb850b5f981 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Mon, 16 May 2022 20:04:17 -0600
Subject: [PATCH 15/44] Fix mixing of blockwise and map-reduce strategies.
 (#103)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/source/user-stories/climatology-hourly.ipynb |  3 ++-
 flox/core.py                                      |  4 ++++
 tests/test_core.py                                | 14 ++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/docs/source/user-stories/climatology-hourly.ipynb b/docs/source/user-stories/climatology-hourly.ipynb
index 6342355b8..91f2cce9c 100644
--- a/docs/source/user-stories/climatology-hourly.ipynb
+++ b/docs/source/user-stories/climatology-hourly.ipynb
@@ -1701,7 +1701,8 @@
    "id": "494766c2-305a-4518-b2b7-a85bcc7fd5b2",
    "metadata": {},
    "source": [
-    "View the performance report [here](https://rawcdn.githack.com/dcherian/flox/592c46ba0bb859f732968b68426b6332caebc213/docs/source/user-stories/hourly-climatology.html)"
+    "View the performance report\n",
+    "[here](https://rawcdn.githack.com/dcherian/flox/592c46ba0bb859f732968b68426b6332caebc213/docs/source/user-stories/hourly-climatology.html)\n"
    ]
   }
  ],
diff --git a/flox/core.py b/flox/core.py
index a4aefe357..189c3e080 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -1583,6 +1583,10 @@ def groupby_reduce(
             array_subset = np.take(array_subset, idxr, axis=ax)
         numblocks = np.prod([len(array_subset.chunks[ax]) for ax in axis])
 
+        # First deep copy because we might be doing blockwise,
+        # which sets agg.finalize=None, then map-reduce (GH102)
+        agg = copy.deepcopy(agg)
+
         # get final result for these groups
         r, *g = partial_agg(
             array_subset,
diff --git a/tests/test_core.py b/tests/test_core.py
index 03dbfbae3..c55ee8d91 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -828,3 +828,17 @@ def test_datetime_binning():
     expected = pd.cut(by, time_bins).codes.copy()
     expected[0] = 14  # factorize doesn't return -1 for nans
     assert_equal(group_idx, expected)
+
+
+@requires_dask
+def test_map_reduce_blockwise_mixed():
+    t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series()
+    data = t.dt.dayofyear
+    actual = groupby_reduce(
+        dask.array.from_array(data.values, chunks=365),
+        t.dt.month,
+        func="mean",
+        method="split-reduce",
+    )
+    expected = groupby_reduce(data, t.dt.month, func="mean")
+    assert_equal(expected, actual)

From 01b1afe1471e8a111df8994cdc7a83405e39b7b5 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Mon, 16 May 2022 20:24:31 -0600
Subject: [PATCH 16/44] Fix reductions on boolean arrays.
(#104) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- flox/core.py | 2 ++ tests/test_core.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/flox/core.py b/flox/core.py index 189c3e080..008d3f483 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1465,6 +1465,8 @@ def groupby_reduce( if not is_duck_array(array): array = np.asarray(array) + array = array.astype(int) if np.issubdtype(array.dtype, bool) else array + if isinstance(isbin, bool): isbin = (isbin,) * len(by) if expected_groups is None: diff --git a/tests/test_core.py b/tests/test_core.py index c55ee8d91..71e3b9bed 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -830,6 +830,17 @@ def test_datetime_binning(): assert_equal(group_idx, expected) +@pytest.mark.parametrize("func", ALL_FUNCS) +def test_bool_reductions(func, engine): + if "arg" in func and engine == "flox": + pytest.skip() + groups = np.array([1, 1, 1]) + data = np.array([True, True, False]) + expected = np.expand_dims(getattr(np, func)(data), -1) + actual, _ = groupby_reduce(data, groups, func=func, engine=engine) + assert_equal(expected, actual) + + @requires_dask def test_map_reduce_blockwise_mixed(): t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series() From 5b7edbe9fda13aa01fd6668d2772b2b8a219e98a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 May 2022 20:42:49 -0600 Subject: [PATCH 17/44] Use isnull instead of isnan: engine="flox" (#105) --- flox/aggregate_flox.py | 8 +++++--- tests/test_xarray.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index 02683080f..fc84250e7 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -2,6 +2,8 @@ import numpy as np +from .xrutils import isnull + def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dtype=None, out=None): """ @@ -36,7 +38,7 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs): - result = func(group_idx, np.where(np.isnan(array), fillna, array), *args, **kwargs) + result = func(group_idx, np.where(isnull(array), fillna, array), *args, **kwargs) # np.nanmax([np.nan, np.nan]) = np.nan # To recover this behaviour, we need to search for the fillna value # (either np.inf or -np.inf), and replace with NaN @@ -74,7 +76,7 @@ def sum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, dty def nansum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): return sum_of_squares( group_idx, - np.where(np.isnan(array), 0, array), + np.where(isnull(array), 0, array), size=size, fill_value=fill_value, axis=axis, @@ -83,7 +85,7 @@ def nansum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, def nanlen(group_idx, array, *args, **kwargs): - return sum(group_idx, (~np.isnan(array)).astype(int), *args, **kwargs) + return sum(group_idx, (~isnull(array)).astype(int), *args, **kwargs) def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 5b50d7c14..52a28e549 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -400,3 +400,16 @@ def test_cache(): xarray_reduce(ds, "labels", func="mean", method="blockwise") assert len(cache.data) == 2 + + +@pytest.mark.parametrize("use_cftime", [True, False]) +def test_datetime_array_reduce(use_cftime): + + time = xr.DataArray( + 
xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime), + dims=("time",), + name="time", + ) + expected = time.resample(time="YS").count() # fails + actual = resample_reduce(time.resample(time="YS"), func="count", engine="flox") + assert_equal(expected, actual) From 227ce041e9b78f8432c62370c9b2c46dd91fa657 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 May 2022 10:11:18 -0600 Subject: [PATCH 18/44] Fix grouping by multiindex (#106) --- flox/xarray.py | 24 +++++++++++++++--------- tests/test_xarray.py | 11 +++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/flox/xarray.py b/flox/xarray.py index 87fbd78d7..95edb843c 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import xarray as xr +from packaging.version import Version from .aggregations import Aggregation, _atleast_1d from .core import ( @@ -345,12 +346,16 @@ def wrapper(array, *by, func, skipna, **kwargs): expect = expect.to_numpy() if isinstance(actual, xr.Dataset) and name in actual: actual = actual.drop_vars(name) - actual[name] = expect - - # if grouping by multi-indexed variable, then restore it - for name, index in ds.indexes.items(): - if name in actual.indexes and isinstance(index, pd.MultiIndex): - actual[name] = index + # When grouping by MultiIndex, expect is an pd.Index wrapping + # an object array of tuples + if name in ds.indexes and isinstance(ds.indexes[name], pd.MultiIndex): + levelnames = ds.indexes[name].names + expect = pd.MultiIndex.from_tuples(expect.values, names=levelnames) + actual[name] = expect + if Version(xr.__version__) > Version("2022.03.0"): + actual = actual.set_coords(levelnames) + else: + actual[name] = expect if unindexed_dims: actual = actual.drop_vars(unindexed_dims) @@ -361,7 +366,8 @@ def wrapper(array, *by, func, skipna, **kwargs): template = obj else: template = obj[var] - actual[var] = _restore_dim_order(actual[var], template, by[0]) + if actual[var].ndim > 1: + actual[var] = _restore_dim_order(actual[var], template, by[0]) if missing_dim: for k, v in missing_dim.items(): @@ -370,9 +376,9 @@ def wrapper(array, *by, func, skipna, **kwargs): } # The expand_dims is for backward compat with xarray's questionable behaviour if missing_group_dims: - actual[k] = v.expand_dims(missing_group_dims) + actual[k] = v.expand_dims(missing_group_dims).variable else: - actual[k] = v + actual[k] = v.variable if isinstance(obj, xr.DataArray): return obj._from_temp_dataset(actual) diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 52a28e549..a25bb5559 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -321,6 +321,17 @@ def test_multi_index_groupby_sum(engine): actual = xarray_reduce(stacked, "space", dim="z", func="sum", engine=engine) assert_equal(expected, actual.unstack("space")) + actual = xarray_reduce(stacked.foo, "space", dim="z", func="sum", engine=engine) + assert_equal(expected.foo, actual.unstack("space")) + + ds = xr.Dataset( + dict(a=(("z",), np.ones(10))), + coords=dict(b=(("z"), np.arange(2).repeat(5)), c=(("z"), np.arange(5).repeat(2))), + ).set_index(bc=["b", "c"]) + expected = ds.groupby("bc").sum() + actual = xarray_reduce(ds, "bc", func="sum") + assert_equal(expected, actual) + @pytest.mark.parametrize("chunks", (None, 2)) def test_xarray_groupby_bins(chunks, engine): From ccbda27cc477580f3b3b847a32b5f1a7722a1f48 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 2 Jun 2022 17:23:20 -0600 Subject: [PATCH 19/44] Convert datetimes to numeric. 
(#108) --- flox/xarray.py | 22 ++++++ flox/xrutils.py | 173 +++++++++++++++++++++++++++++++++++++++++++ tests/test_xarray.py | 7 +- 3 files changed, 199 insertions(+), 3 deletions(-) diff --git a/flox/xarray.py b/flox/xarray.py index 95edb843c..b00a85c74 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -6,6 +6,7 @@ import pandas as pd import xarray as xr from packaging.version import Version +from xarray.core.duck_array_ops import _datetime_nanmin from .aggregations import Aggregation, _atleast_1d from .core import ( @@ -15,6 +16,7 @@ rechunk_for_blockwise as rechunk_array_for_blockwise, rechunk_for_cohorts as rechunk_array_for_cohorts, ) +from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric if TYPE_CHECKING: from xarray import DataArray, Dataset, Resample @@ -289,7 +291,27 @@ def wrapper(array, *by, func, skipna, **kwargs): if "nan" not in func and func not in ["all", "any", "count"]: func = f"nan{func}" + requires_numeric = func not in ["count", "any", "all"] + if requires_numeric: + is_npdatetime = array.dtype.kind in "Mm" + is_cftime = _contains_cftime_datetimes(array) + if is_npdatetime: + offset = _datetime_nanmin(array) + # xarray always uses np.datetime64[ns] for np.datetime64 data + dtype = "timedelta64[ns]" + array = datetime_to_numeric(array, offset) + elif _contains_cftime_datetimes(array): + offset = min(array) + array = datetime_to_numeric(array, offset, datetime_unit="us") + result, *groups = groupby_reduce(array, *by, func=func, **kwargs) + + if requires_numeric: + if is_npdatetime: + return result.astype(dtype) + offset + elif is_cftime: + return _to_pytimedelta(result, unit="us") + offset + return result # These data variables do not have any of the core dimension, diff --git a/flox/xrutils.py b/flox/xrutils.py index 9357742a2..047a83408 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -2,11 +2,18 @@ # defined in xarray +import datetime from typing import Any, Iterable import numpy as np import pandas as pd +try: + import cftime +except ImportError: + cftime = None + + try: import dask.array @@ -15,6 +22,10 @@ dask_array_type = () +def asarray(data, xp=np): + return data if is_duck_array(data) else xp.asarray(data) + + def is_duck_array(value: Any) -> bool: """Checks if value is a duck array.""" if isinstance(value, np.ndarray): @@ -110,3 +121,165 @@ def isnull(data): # a null value as well as NaN, but it isn't clear how to do this # with duck typing. return data != data + + +def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): + """Convert an array containing datetime-like data to numerical values. + Convert the datetime array to a timedelta relative to an offset. + Parameters + ---------- + array : array-like + Input data + offset : None, datetime or cftime.datetime + Datetime offset. If None, this is set by default to the array's minimum + value to reduce round off errors. + datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as} + If not None, convert output to a given datetime unit. Note that some + conversions are not allowed due to non-linear relationships between units. + dtype : dtype + Output dtype. + Returns + ------- + array + Numerical representation of datetime object relative to an offset. + Notes + ----- + Some datetime unit conversions won't work, for example from days to years, even + though some calendars would allow for them (e.g. no_leap). This is because there + is no `cftime.timedelta` object. + """ + # TODO: make this function dask-compatible? 
+ # Set offset to minimum if not given + from xarray.core.duck_array_ops import _datetime_nanmin + + if offset is None: + if array.dtype.kind in "Mm": + offset = _datetime_nanmin(array) + else: + offset = min(array) + + # Compute timedelta object. + # For np.datetime64, this can silently yield garbage due to overflow. + # One option is to enforce 1970-01-01 as the universal offset. + + # This map_blocks call is for backwards compatibility. + # dask == 2021.04.1 does not support subtracting object arrays + # which is required for cftime + if is_duck_dask_array(array) and np.issubdtype(array.dtype, object): + array = array.map_blocks(lambda a, b: a - b, offset, meta=array._meta) + else: + array = array - offset + + # Scalar is converted to 0d-array + if not hasattr(array, "dtype"): + array = np.array(array) + + # Convert timedelta objects to float by first converting to microseconds. + if array.dtype.kind in "O": + return py_timedelta_to_float(array, datetime_unit or "ns").astype(dtype) + + # Convert np.NaT to np.nan + elif array.dtype.kind in "mM": + + # Convert to specified timedelta units. + if datetime_unit: + array = array / np.timedelta64(1, datetime_unit) + return np.where(isnull(array), np.nan, array.astype(dtype)) + + +def timedelta_to_numeric(value, datetime_unit="ns", dtype=float): + """Convert a timedelta-like object to numerical values. + + Parameters + ---------- + value : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, str + Time delta representation. + datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as} + The time units of the output values. Note that some conversions are not allowed due to + non-linear relationships between units. + dtype : type + The output data type. + + """ + import datetime as dt + + if isinstance(value, dt.timedelta): + out = py_timedelta_to_float(value, datetime_unit) + elif isinstance(value, np.timedelta64): + out = np_timedelta64_to_float(value, datetime_unit) + elif isinstance(value, pd.Timedelta): + out = pd_timedelta_to_float(value, datetime_unit) + elif isinstance(value, str): + try: + a = pd.to_timedelta(value) + except ValueError: + raise ValueError( + f"Could not convert {value!r} to timedelta64 using pandas.to_timedelta" + ) + return py_timedelta_to_float(a, datetime_unit) + else: + raise TypeError( + f"Expected value of type str, pandas.Timedelta, datetime.timedelta " + f"or numpy.timedelta64, but received {type(value).__name__}" + ) + return out.astype(dtype) + + +def _to_pytimedelta(array, unit="us"): + return array.astype(f"timedelta64[{unit}]").astype(datetime.timedelta) + + +def np_timedelta64_to_float(array, datetime_unit): + """Convert numpy.timedelta64 to float. + + Notes + ----- + The array is first converted to microseconds, which is less likely to + cause overflow errors. + """ + array = array.astype("timedelta64[ns]").astype(np.float64) + conversion_factor = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit) + return conversion_factor * array + + +def pd_timedelta_to_float(value, datetime_unit): + """Convert pandas.Timedelta to float. + + Notes + ----- + Built on the assumption that pandas timedelta values are in nanoseconds, + which is also the numpy default resolution. 
+ """ + value = value.to_timedelta64() + return np_timedelta64_to_float(value, datetime_unit) + + +def _timedelta_to_seconds(array): + return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6 + + +def py_timedelta_to_float(array, datetime_unit): + """Convert a timedelta object to a float, possibly at a loss of resolution.""" + array = asarray(array) + if is_duck_dask_array(array): + array = array.map_blocks(_timedelta_to_seconds, meta=np.array([], dtype=np.float64)) + else: + array = _timedelta_to_seconds(array) + conversion_factor = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit) + return conversion_factor * array + + +def _contains_cftime_datetimes(array) -> bool: + """Check if an array contains cftime.datetime objects""" + if cftime is None: + return False + else: + if array.dtype == np.dtype("O") and array.size > 0: + sample = array.ravel()[0] + if is_duck_dask_array(sample): + sample = sample.compute() + if isinstance(sample, np.ndarray): + sample = sample.item() + return isinstance(sample, cftime.datetime) + else: + return False diff --git a/tests/test_xarray.py b/tests/test_xarray.py index a25bb5559..30e103b4d 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -414,13 +414,14 @@ def test_cache(): @pytest.mark.parametrize("use_cftime", [True, False]) -def test_datetime_array_reduce(use_cftime): +@pytest.mark.parametrize("func", ["count", "mean"]) +def test_datetime_array_reduce(use_cftime, func): time = xr.DataArray( xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime), dims=("time",), name="time", ) - expected = time.resample(time="YS").count() # fails - actual = resample_reduce(time.resample(time="YS"), func="count", engine="flox") + expected = getattr(time.resample(time="YS"), func)() + actual = resample_reduce(time.resample(time="YS"), func=func, engine="flox") assert_equal(expected, actual) From 915a91bb0711a475abfc60c0f502d57c3960a5dc Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Jun 2022 17:13:30 -0600 Subject: [PATCH 20/44] Fix reduction along dimensions not in groupers. xref #109 --- flox/xarray.py | 2 +- tests/test_xarray.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/flox/xarray.py b/flox/xarray.py index b00a85c74..fe513dda2 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -241,7 +241,7 @@ def xarray_reduce( raise ValueError(f"Cannot reduce over absent dimensions {dim}.") dims_not_in_groupers = tuple(d for d in dim if d not in grouper_dims) - if dims_not_in_groupers == dim and not any(isbin): + if dims_not_in_groupers == tuple(dim) and not any(isbin): # reducing along a dimension along which groups do not vary # This is really just a normal reduction. # This is not right when binning so we exclude. 
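
The tuple() coercion above is the fix: `dim` may arrive as a list (the new
test below passes dim=["x", "y"]), and in Python a tuple never compares equal
to a list, so the old `dims_not_in_groupers == dim` check never matched for
list inputs. A minimal illustration:

    ("x", "y") == ["x", "y"]         # False: tuple vs. list
    ("x", "y") == tuple(["x", "y"])  # True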
diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 30e103b4d..cb74617ea 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -188,6 +188,11 @@ def test_xarray_reduce_single_grouper(engine): actual = xarray_reduce(ds, ds.time.dt.month, func="mean", engine=engine) xr.testing.assert_allclose(actual, expected) + # reduce along other dimensions + expected = ds.groupby("time.month").mean(("x", "y")) + actual = xarray_reduce(ds, ds.time.dt.month, dim=["x", "y"], func="mean", engine=engine) + xr.testing.assert_allclose(actual, expected) + # add data var with missing grouper dim ds["foo"] = ("bar", [1, 2, 3]) expected = ds.groupby("time.month").mean() From 36a3769d555ca39ecadf46d2bde3cccec070aca9 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Jun 2022 17:29:07 -0600 Subject: [PATCH 21/44] Fix grouping by datetime --- flox/core.py | 4 ++-- tests/test_core.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/flox/core.py b/flox/core.py index 008d3f483..38f4a12ee 100644 --- a/flox/core.py +++ b/flox/core.py @@ -162,7 +162,7 @@ def find_group_cohorts(labels, chunks, merge=True, method="cohorts"): labels = np.asarray(labels) if method == "split-reduce": - return _get_expected_groups(labels, sort=False).values.reshape(-1, 1).tolist() + return list(_get_expected_groups(labels, sort=False).to_numpy().reshape(-1, 1)) # Build an array with the shape of labels, but where every element is the "chunk number" # 1. First subset the array appropriately @@ -1225,7 +1225,7 @@ def dask_groupby_agg( if method == "map-reduce": if expected_groups is None: expected_groups = _get_expected_groups(by_input, sort=sort) - groups = (expected_groups.values,) + groups = (expected_groups.to_numpy(),) else: groups = (np.concatenate(groups_in_block),) diff --git a/tests/test_core.py b/tests/test_core.py index 71e3b9bed..320972d59 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -853,3 +853,19 @@ def test_map_reduce_blockwise_mixed(): ) expected = groupby_reduce(data, t.dt.month, func="mean") assert_equal(expected, actual) + + +@requires_dask +@pytest.mark.parametrize("method", ["blockwise", "split-reduce", "map-reduce", "cohorts"]) +def test_group_by_datetime(engine, method): + t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series() + data = t.dt.dayofyear + actual, _ = groupby_reduce( + dask.array.from_array(data.values, chunks=365), + t, + func="mean", + method=method, + engine=engine, + ) + expected = data.to_numpy().astype(float) + assert_equal(expected, actual) From 6fb807d36502f8152372df7898ac49daab6cad32 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Jun 2022 20:10:18 -0600 Subject: [PATCH 22/44] Fix binning by datetime. 
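
Binning by a datetime variable is exercised by the new test below. A minimal
sketch of the usage this enables (setup borrowed from that test):

    import dask.array
    import pandas as pd
    from flox.core import groupby_reduce

    t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series()
    daskarray = dask.array.from_array(t.dt.dayofyear.values, chunks=30)
    edges = pd.date_range("1999-12-31", "2000-12-31", freq="M").to_series().to_numpy()
    result, groups = groupby_reduce(
        daskarray, t.to_numpy(), isbin=True, expected_groups=edges, func="mean"
    )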
---
 flox/core.py       | 10 +++++-----
 tests/test_core.py | 31 +++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/flox/core.py b/flox/core.py
index 38f4a12ee..049267287 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -1566,7 +1566,7 @@ def groupby_reduce(
         if kwargs["fill_value"] is None:
             kwargs["fill_value"] = agg.fill_value[agg.name]

-        partial_agg = partial(dask_groupby_agg, agg=agg, split_out=split_out, **kwargs)
+        partial_agg = partial(dask_groupby_agg, split_out=split_out, **kwargs)

         if method in ["split-reduce", "cohorts"]:
             cohorts = find_group_cohorts(
@@ -1585,15 +1585,14 @@ def groupby_reduce(
                 array_subset = np.take(array_subset, idxr, axis=ax)
             numblocks = np.prod([len(array_subset.chunks[ax]) for ax in axis])

-            # First deep copy because we might be doing blockwise,
-            # which sets agg.finalize=None, then map-reduce (GH102)
-            agg = copy.deepcopy(agg)
-
            # get final result for these groups
            r, *g = partial_agg(
                array_subset,
                by[np.ix_(*indexer)],
                expected_groups=pd.Index(cohort),
+                # First deep copy because we might be doing blockwise,
+                # which sets agg.finalize=None, then map-reduce (GH102)
+                agg=copy.deepcopy(agg),
                # reindex to expected_groups at the blockwise step.
                # this approach avoids replacing non-cohort members with
                # np.nan or some other sentinel value, and preserves dtypes
@@ -1619,6 +1618,7 @@ def groupby_reduce(
            array,
            by,
            expected_groups=None if method == "blockwise" else expected_groups,
+            agg=agg,
            reindex=reindex,
            method=method,
            sort=sort,
diff --git a/tests/test_core.py b/tests/test_core.py
index 320972d59..8d1226a03 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -856,16 +856,35 @@ def test_map_reduce_blockwise_mixed():


 @requires_dask
-@pytest.mark.parametrize("method", ["blockwise", "split-reduce", "map-reduce", "cohorts"])
+@pytest.mark.parametrize("method", ["split-reduce", "blockwise", "map-reduce", "cohorts"])
 def test_group_by_datetime(engine, method):
-    t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series()
-    data = t.dt.dayofyear
-    actual, _ = groupby_reduce(
-        dask.array.from_array(data.values, chunks=365),
-        t,
+    kwargs = dict(
         func="mean",
         method=method,
         engine=engine,
     )
+    t = pd.date_range("2000-01-01", "2000-12-31", freq="D").to_series()
+    data = t.dt.dayofyear
+    daskarray = dask.array.from_array(data.values, chunks=30)
+
+    actual, _ = groupby_reduce(daskarray, t, **kwargs)
     expected = data.to_numpy().astype(float)
     assert_equal(expected, actual)
+
+    if method == "blockwise":
+        return None
+
+    edges = pd.date_range("1999-12-31", "2000-12-31", freq="M").to_series().to_numpy()
+    actual, _ = groupby_reduce(daskarray, t.to_numpy(), isbin=True, expected_groups=edges, **kwargs)
+    expected = data.resample("M").mean().to_numpy()
+    assert_equal(expected, actual)
+
+    actual, _ = groupby_reduce(
+        np.broadcast_to(daskarray, (2, 3, daskarray.shape[-1])),
+        t.to_numpy(),
+        isbin=True,
+        expected_groups=edges,
+        **kwargs,
+    )
+    expected = np.broadcast_to(expected, (2, 3, expected.shape[-1]))
+    assert_equal(expected, actual)

From 1468aebf957f6632b3b45a79eed5fe21808c2b95 Mon Sep 17 00:00:00 2001
From: dcherian
Date: Fri, 3 Jun 2022 17:50:50 -0600
Subject: [PATCH 23/44] keep_attrs is True by default

---
 flox/xarray.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/flox/xarray.py b/flox/xarray.py
index fe513dda2..15e2ff148 100644
--- a/flox/xarray.py
+++ b/flox/xarray.py
@@ -62,7 +62,7 @@ def xarray_reduce(
     fill_value=None,
     method: str = "map-reduce",
     engine: str = "flox",
-    keep_attrs:
bool = True, + keep_attrs: bool | None = True, skipna: bool | None = None, min_count: int | None = None, reindex: bool | None = None, @@ -181,6 +181,10 @@ def xarray_reduce( if isinstance(b, xr.DataArray) and b.name is None: raise ValueError("Cannot group by unnamed DataArrays.") + # TODO: move to GroupBy._flox_reduce + if keep_attrs is None: + keep_attrs = True + if isinstance(isbin, bool): isbin = (isbin,) * len(by) if expected_groups is None: From 759694bffd30b367ff5575ba5f31cf9adb12689d Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Jun 2022 20:15:16 -0600 Subject: [PATCH 24/44] Fix sort being specified twice. --- flox/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flox/core.py b/flox/core.py index 049267287..7613c1f24 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1548,12 +1548,12 @@ def groupby_reduce( # overwrite than when min_count is set fill_value = np.nan - kwargs = dict(axis=axis, fill_value=fill_value, engine=engine, sort=sort) + kwargs = dict(axis=axis, fill_value=fill_value, engine=engine) agg = _initialize_aggregation(func, array.dtype, fill_value, min_count, finalize_kwargs) if not has_dask: results = _reduce_blockwise( - array, by, agg, expected_groups=expected_groups, reindex=reindex, **kwargs + array, by, agg, expected_groups=expected_groups, reindex=reindex, sort=sort, **kwargs ) groups = (results["groups"],) result = results[agg.name] From a0b9d1f2279c7449781d89d0d6b1a8c006fe1ad0 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Jun 2022 17:45:44 -0600 Subject: [PATCH 25/44] Fix binning by nD variable --- flox/core.py | 2 +- tests/test_xarray.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index 7613c1f24..b4e68c23f 100644 --- a/flox/core.py +++ b/flox/core.py @@ -462,7 +462,7 @@ def factorize_( group_idx = factorized[0] if fastpath: - return group_idx, found_groups, grp_shape + return group_idx.reshape(by[0].shape), found_groups, grp_shape if np.isscalar(axis) and groupvar.ndim > 1: # Not reducing along all dimensions of by diff --git a/tests/test_xarray.py b/tests/test_xarray.py index cb74617ea..5f779d0ad 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -430,3 +430,24 @@ def test_datetime_array_reduce(use_cftime, func): expected = getattr(time.resample(time="YS"), func)() actual = resample_reduce(time.resample(time="YS"), func=func, engine="flox") assert_equal(expected, actual) + + +@requires_dask +def test_groupby_bins_indexed_coordinate(): + ds = ( + xr.tutorial.open_dataset("air_temperature") + .isel(time=slice(100)) + .chunk({"time": 20, "lat": 5}) + ) + bins = [40, 50, 60, 70] + expected = ds.groupby_bins("lat", bins=bins).mean(keep_attrs=True, dim=...) + actual = xarray_reduce( + ds, + ds.lat, + dim=ds.air.dims, + expected_groups=([40, 50, 60, 70],), + isbin=(True,), + func="mean", + method="split-reduce", + ) + xr.testing.assert_allclose(expected, actual) From 5e0b79331ef07c58b650ee88c921b8d09d34ef80 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 23 Jun 2022 16:44:15 -0600 Subject: [PATCH 26/44] Fix bug where we had extra groups in expected_groups. (#112) * Fix bug where we had extra groups in expected_groups. This affected _factorize_multiple. 
Closes #111 * Fix extra expected groups (#113) * fix dask case * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: LunarLanding <4441338+LunarLanding@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- flox/core.py | 9 ++++++--- tests/test_xarray.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/flox/core.py b/flox/core.py index b4e68c23f..6de0db58f 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1310,11 +1310,12 @@ def _lazy_factorize_wrapper(*by, **kwargs): return group_idx -def _factorize_multiple(by, expected_groups, by_is_dask): +def _factorize_multiple(by, expected_groups, by_is_dask, reindex): kwargs = dict( expected_groups=expected_groups, axis=None, # always None, we offset later if necessary. fastpath=True, + reindex=reindex, ) if by_is_dask: import dask.array @@ -1325,7 +1326,9 @@ def _factorize_multiple(by, expected_groups, by_is_dask): meta=np.array((), dtype=np.int64), **kwargs, ) - found_groups = tuple(None if is_duck_dask_array(b) else pd.unique(b) for b in by) + found_groups = tuple( + None if is_duck_dask_array(b) else pd.unique(np.array(b).reshape(-1)) for b in by + ) grp_shape = tuple(len(e) for e in expected_groups) else: group_idx, found_groups, grp_shape = factorize_(by, **kwargs) @@ -1489,7 +1492,7 @@ def groupby_reduce( ) if factorize_early: by, final_groups, grp_shape = _factorize_multiple( - by, expected_groups, by_is_dask=by_is_dask + by, expected_groups, by_is_dask=by_is_dask, reindex=reindex ) expected_groups = (pd.RangeIndex(np.prod(grp_shape)),) diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 5f779d0ad..fa54e4e54 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -451,3 +451,37 @@ def test_groupby_bins_indexed_coordinate(): method="split-reduce", ) xr.testing.assert_allclose(expected, actual) + + +@pytest.mark.parametrize("chunk", (True, False)) +def test_mixed_grouping(chunk): + if not has_dask and chunk: + pytest.skip() + # regression test for https://github.com/dcherian/flox/pull/111 + sa = 10 + sb = 13 + sc = 3 + + x = xr.Dataset( + { + "v0": xr.DataArray( + ((np.arange(sa * sb * sc) / sa) % 1).reshape((sa, sb, sc)), + dims=("a", "b", "c"), + ), + "v1": xr.DataArray((np.arange(sa * sb) % 3).reshape(sa, sb), dims=("a", "b")), + } + ) + if chunk: + x["v0"] = x["v0"].chunk({"a": 5}) + + r = xarray_reduce( + x["v0"], + x["v1"], + x["v0"], + expected_groups=(np.arange(6), np.linspace(0, 1, num=5)), + isbin=[False, True], + func="count", + dim="b", + fill_value=0, + ) + assert (r.sel(v1=[3, 4, 5]) == 0).all().data From f65cb521694eab435bf4fbaa6de42c7eab800d63 Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 29 Jun 2022 21:48:23 -0600 Subject: [PATCH 27/44] Avoid a copy. 
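
`pd.unique(np.array(b).reshape(-1))` forced a copy of each `by` array, while
`b.reshape(-1)` returns a view whenever the data are contiguous. A quick
illustration of the difference:

    import numpy as np

    b = np.arange(10).reshape(5, 2)
    assert np.shares_memory(b.reshape(-1), b)    # view, no copy
    assert not np.shares_memory(np.array(b), b)  # np.array(b) copies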
--- flox/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index 6de0db58f..c28db22b8 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1327,7 +1327,7 @@ def _factorize_multiple(by, expected_groups, by_is_dask, reindex): **kwargs, ) found_groups = tuple( - None if is_duck_dask_array(b) else pd.unique(np.array(b).reshape(-1)) for b in by + None if is_duck_dask_array(b) else pd.unique(b.reshape(-1)) for b in by ) grp_shape = tuple(len(e) for e in expected_groups) else: From dcfb1dba3c6bdc035483d92a42bda2deb31ecf23 Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 29 Jun 2022 21:48:33 -0600 Subject: [PATCH 28/44] Better error message --- flox/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index c28db22b8..a877969fb 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1480,7 +1480,10 @@ def groupby_reduce( if len(by) == 1 and not isinstance(expected_groups, tuple): expected_groups = (np.asarray(expected_groups),) elif len(expected_groups) != len(by): - raise ValueError("len(expected_groups) != len(by)") + raise ValueError( + f"Must have same number of `expected_groups` (received {len(expected_groups)}) " + f" and variables to group by (received {len(by)})." + ) # We convert to pd.Index since that lets us know if we are binning or not # (pd.IntervalIndex or not) From 56461799e646a7ac8ceb100923cb0d2f5912ba58 Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 29 Jun 2022 21:35:58 -0600 Subject: [PATCH 29/44] Correctly factorize values outside bin edges --- flox/core.py | 8 ++++++-- tests/test_core.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/flox/core.py b/flox/core.py index a877969fb..b188c86e3 100644 --- a/flox/core.py +++ b/flox/core.py @@ -431,6 +431,7 @@ def factorize_( # pd.cut with bins = IntervalIndex[datetime64] doesn't work... 
             if groupvar.dtype.kind == "M":
                 expect = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]])
+            # code is -1 for values outside the bounds of all intervals
             idx = pd.cut(groupvar.ravel(), bins=expect).codes.copy()
         else:
             if expect is not None and reindex:
@@ -455,9 +456,12 @@ def factorize_(
         grp_shape = tuple(len(grp) for grp in found_groups)
         ngroups = np.prod(grp_shape)
         if len(by) > 1:
-            group_idx = np.ravel_multi_index(factorized, grp_shape, mode="wrap").reshape(by[0].shape)
-            nan_by_mask = reduce(np.logical_or, [isnull(b) for b in by])
+            group_idx = np.ravel_multi_index(factorized, grp_shape, mode="wrap")
+            # NaNs, as well as values outside the bins, are coded by -1
+            # Restore these after the raveling
+            nan_by_mask = reduce(np.logical_or, [(f == -1) for f in factorized])
             group_idx[nan_by_mask] = -1
+            group_idx = group_idx.reshape(by[0].shape)
         else:
             group_idx = factorized[0]

diff --git a/tests/test_core.py b/tests/test_core.py
index 8d1226a03..0e0de52b7 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -888,3 +888,20 @@ def test_group_by_datetime(engine, method):
     )
     expected = np.broadcast_to(expected, (2, 3, expected.shape[-1]))
     assert_equal(expected, actual)
+
+
+def test_factorize_values_outside_bins():
+
+    vals = factorize_(
+        (np.arange(10).reshape(5, 2), np.arange(10).reshape(5, 2)),
+        axis=(0, 1),
+        expected_groups=(
+            pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)),
+            pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)),
+        ),
+        reindex=True,
+        fastpath=True,
+    )
+    actual = vals[0]
+    expected = np.array([[-1, -1], [-1, 0], [6, 12], [18, 24], [-1, -1]])
+    assert_equal(expected, actual)

From c387edcf45e490915e2702a3cf805cf38a518a00 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Thu, 30 Jun 2022 12:04:20 -0600
Subject: [PATCH 30/44] Fix factorizing some more. (#115)

---
 flox/core.py       | 21 ++++++++---------
 tests/test_core.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/flox/core.py b/flox/core.py
index b188c86e3..dc121bf83 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -421,6 +421,7 @@ def factorize_(
     factorized = []
     found_groups = []
     for groupvar, expect in zip(by, expected_groups):
+        flat = groupvar.ravel()
         if isinstance(expect, pd.IntervalIndex):
             # when binning we change expected groups to integers marking the interval
             # this makes the reindexing logic simpler.
@@ -432,21 +433,19 @@ def factorize_(
             if groupvar.dtype.kind == "M":
                 expect = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]])
             # code is -1 for values outside the bounds of all intervals
-            idx = pd.cut(groupvar.ravel(), bins=expect).codes.copy()
+            idx = pd.cut(flat, bins=expect).codes.copy()
         else:
             if expect is not None and reindex:
-                groups = expect
+                sorter = np.argsort(expect)
+                groups = expect[(sorter,)] if sort else expect
+                idx = np.searchsorted(expect, flat, sorter=sorter)
+                mask = ~np.isin(flat, expect) | isnull(flat) | (idx == len(expect))
                 if not sort:
-                    sorter = np.argsort(expect)
-                else:
-                    sorter = None
-                idx = np.searchsorted(expect, groupvar.ravel(), sorter=sorter)
-                mask = isnull(groupvar.ravel()) | (idx == len(expect))
-                # TODO: optimize?
+                    # idx is the index into the sorted array.
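+                    # e.g. (illustrative) expect=[3, 1, 2] gives sorter=[1, 2, 0]
+                    # (sorted view [1, 2, 3]); for flat=[1, 3], searchsorted returns
+                    # idx=[0, 2], and sorter[idx]=[1, 0] are the positions in the
+                    # original, unsorted expect.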
+ # if we didn't want sorting, unsort it back + idx[(idx == len(expect),)] = -1 + idx = sorter[(idx,)] idx[mask] = -1 - if not sort: - idx = sorter[idx] - idx[mask] = -1 else: idx, groups = pd.factorize(groupvar.ravel(), sort=sort) diff --git a/tests/test_core.py b/tests/test_core.py index 0e0de52b7..7f26bbc49 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -905,3 +905,62 @@ def test_factorize_values_outside_bins(): actual = vals[0] expected = np.array([[-1, -1], [-1, 0], [6, 12], [18, 24], [-1, -1]]) assert_equal(expected, actual) + + +def test_multiple_groupers(): + actual, *_ = groupby_reduce( + np.ones((5, 2)), + np.arange(10).reshape(5, 2), + np.arange(10).reshape(5, 2), + axis=(0, 1), + expected_groups=( + pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), + pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), + ), + reindex=True, + func="count", + ) + expected = np.eye(5, 5) + assert_equal(expected, actual) + + +def test_factorize_reindex_sorting_strings(): + kwargs = dict( + by=(np.array(["El-Nino", "La-Nina", "boo", "Neutral"]),), + axis=-1, + expected_groups=(np.array(["El-Nino", "Neutral", "foo", "La-Nina"]),), + ) + + expected = factorize_(**kwargs, reindex=True, sort=True)[0] + assert_equal(expected, [0, 1, 4, 2]) + + expected = factorize_(**kwargs, reindex=True, sort=False)[0] + assert_equal(expected, [0, 3, 4, 1]) + + expected = factorize_(**kwargs, reindex=False, sort=False)[0] + assert_equal(expected, [0, 1, 2, 3]) + + expected = factorize_(**kwargs, reindex=False, sort=True)[0] + assert_equal(expected, [0, 1, 3, 2]) + + +def test_factorize_reindex_sorting_ints(): + kwargs = dict( + by=(np.array([-10, 1, 10, 2, 3, 5]),), + axis=-1, + expected_groups=(np.array([0, 1, 2, 3, 4, 5]),), + ) + + expected = factorize_(**kwargs, reindex=True, sort=True)[0] + assert_equal(expected, [6, 1, 6, 2, 3, 5]) + + expected = factorize_(**kwargs, reindex=True, sort=False)[0] + assert_equal(expected, [6, 1, 6, 2, 3, 5]) + + kwargs["expected_groups"] = (np.arange(5, -1, -1),) + + expected = factorize_(**kwargs, reindex=True, sort=True)[0] + assert_equal(expected, [6, 1, 6, 2, 3, 5]) + + expected = factorize_(**kwargs, reindex=True, sort=False)[0] + assert_equal(expected, [6, 4, 6, 3, 2, 0]) From 5e2ad209cffdc138e69966b7640fa2c00998d86c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Jul 2022 06:37:44 -0600 Subject: [PATCH 31/44] Bump actions/setup-python from 3 to 4 (#116) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 3 to 4. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/pypi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index db4ffd715..7372dd0fa 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -10,7 +10,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.x' - name: Install dependencies From de63e380b0606eb82918cd167e804885cb58cd16 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 1 Jul 2022 21:21:41 -0600 Subject: [PATCH 32/44] Don't execute hourly climatology notebook (#98) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/conf.py | 6 +++++- docs/source/user-stories/climatology-hourly.ipynb | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 9c00d8c83..6fda3c575 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,10 +57,14 @@ current_year = datetime.datetime.now().year copyright = f"2021-{current_year}, Deepak Cherian" author = "Deepak Cherian" + +# Myst_nb options +nb_execution_excludepatterns = ["climatology-hourly.ipynb"] +nb_execution_raise_on_error = True + # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# # The short X.Y version. version = flox.__version__.split("+")[0] # The full version, including alpha/beta/rc tags. diff --git a/docs/source/user-stories/climatology-hourly.ipynb b/docs/source/user-stories/climatology-hourly.ipynb index 91f2cce9c..5fef37851 100644 --- a/docs/source/user-stories/climatology-hourly.ipynb +++ b/docs/source/user-stories/climatology-hourly.ipynb @@ -1702,7 +1702,9 @@ "metadata": {}, "source": [ "View the performance report\n", - "[here](https://rawcdn.githack.com/dcherian/flox/592c46ba0bb859f732968b68426b6332caebc213/docs/source/user-stories/hourly-climatology.html)\n" + "[here](https://rawcdn.githack.com/dcherian/flox/592c46ba0bb859f732968b68426b6332caebc213/docs/source/user-stories/hourly-climatology.html),\n", + "and a video of the dask dashboard\n", + "[here](https://drive.google.com/file/d/1uY36DiTbv1w7TefbrCEyBcOli5NiaNUP/view?usp=sharing)\n" ] } ], @@ -1724,6 +1726,9 @@ "pygments_lexer": "ipython3", "version": "3.9.12" }, + "mystnb": { + "execution_mode": "off" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { From ded9b31c1509f14ab44b291c45687d9e9df173f9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 1 Jul 2022 21:21:50 -0600 Subject: [PATCH 33/44] Allow custom numpy aggregations for "blockwise" (#117) --- flox/core.py | 7 +++++-- tests/test_core.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/flox/core.py b/flox/core.py index dc121bf83..b199878d5 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1568,8 +1568,11 @@ def groupby_reduce( result = results[agg.name] else: - if agg.chunk is None: - raise NotImplementedError(f"{func} not implemented for dask arrays") + if agg.chunk[0] is None and method != "blockwise": + raise NotImplementedError( + f"Aggregation {func.name!r} is only implemented for dask arrays when method='blockwise'." 
+ f"\n\n Received: {func}" + ) # we always need some fill_value (see above) so choose the default if needed if kwargs["fill_value"] is None: diff --git a/tests/test_core.py b/tests/test_core.py index 7f26bbc49..9c0bd6adb 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,6 +5,7 @@ import pytest from numpy_groupies.aggregate_numpy import aggregate +from flox.aggregations import Aggregation from flox.core import ( _convert_expected_groups_to_index, _get_optimal_chunks_for_groups, @@ -964,3 +965,47 @@ def test_factorize_reindex_sorting_ints(): expected = factorize_(**kwargs, reindex=True, sort=False)[0] assert_equal(expected, [6, 4, 6, 3, 2, 0]) + + +@requires_dask +def test_custom_aggregation_blockwise(): + def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): + return aggregate( + group_idx, + array, + func=np.median, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + agg_median = Aggregation( + name="median", numpy=grouped_median, fill_value=-1, chunk=None, combine=None + ) + + array = np.arange(100, dtype=np.float32).reshape(5, 20) + by = np.ones((20,)) + + actual, _ = groupby_reduce(array, by, func=agg_median, axis=-1) + expected = np.median(array, axis=-1, keepdims=True) + assert_equal(expected, actual) + + for method in ["map-reduce", "cohorts", "split-reduce"]: + with pytest.raises(NotImplementedError): + groupby_reduce( + dask.array.from_array(array, chunks=(1, -1)), + by, + func=agg_median, + axis=-1, + method=method, + ) + + actual, _ = groupby_reduce( + dask.array.from_array(array, chunks=(1, -1)), + by, + func=agg_median, + axis=-1, + method="blockwise", + ) + assert_equal(expected, actual) From d78156fe5dcf031ec6f4393f946bacdbc2b71a7a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Jul 2022 15:28:31 -0600 Subject: [PATCH 34/44] [pre-commit.ci] pre-commit autoupdate (#118) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.1.0 → v4.3.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.1.0...v4.3.0) - [github.com/psf/black: 22.3.0 → 22.6.0](https://github.com/psf/black/compare/22.3.0...22.6.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf1fd69d6..1d4aaa3c1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,14 +3,14 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-docstring-first - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 22.6.0 hooks: - id: black From 04cf4d71fbc5cfc79b4aa9db406ca135eb1a20df Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 5 Jul 2022 20:52:50 -0600 Subject: [PATCH 35/44] Add NASA Xarray grant --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fbec8bab2..de2f86633 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions)[![pre-commit.ci 
status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main)[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox)[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/)[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox)[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest)[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml) +[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions)[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main)[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox)[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/)[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox)[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest)[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml)![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) # flox From e4c4e3061d5f330c293a5d1333c6c04322a3cf06 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 5 Jul 2022 20:53:18 -0600 Subject: [PATCH 36/44] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index de2f86633..4fcddddd8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions)[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main)[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox)[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/)[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox)[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest)[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml)![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) +[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions)[![pre-commit.ci 
status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main)[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox)[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/)[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox)[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest)[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml)[![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) # flox From d6065be6719784ce9ec0cf58486bafe2458d3785 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 5 Jul 2022 20:57:06 -0600 Subject: [PATCH 37/44] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4fcddddd8..bb2d241d5 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ about this package, from the Pangeo Showcase). ## Acknowledgements This work was funded in part by NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System -Data in the Cloud" (PI J. Hamman), and [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). +Data in the Cloud" (PI J. Hamman, NCAR), NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire) and [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). It was motivated by [very](https://github.com/pangeo-data/pangeo/issues/266) [very](https://github.com/pangeo-data/pangeo/issues/271) [many](https://github.com/dask/distributed/issues/2602) [discussions](https://github.com/pydata/xarray/issues/2237) in the [Pangeo](https://pangeo.io) community. ## API From 61b134b796e6da6bba830f458face53d53bc2b10 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 8 Jul 2022 21:42:49 -0600 Subject: [PATCH 38/44] Add xarray groupby test (#121) * Add xarray groupby test * syntax * fix install * to-revert * fix * also revert * fix install * ref instead of ref_nae * activate env * Revert "activate env" This reverts commit e18ba3dc7fdf4633d4ff3f5a27815e69424de3ec. * activate * activate again * change shell * try again * Revert "also revert" This reverts commit e4f4297dd48dfa1a13a7e0b0a20c70b93315031c. * Revert "to-revert" This reverts commit a86fcd45aa55f05d6427ab9fa8534f2774809048. --- .github/workflows/ci.yaml | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8813e4da4..b22c249c7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -97,3 +97,46 @@ jobs: shell: bash -l {0} run: | pytest -n 2 + + xarray-groupby: + name: xarray-groupby + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v3 + with: + repository: 'pydata/xarray' + fetch-depth: 0 # Fetch all history for all branches and tags. 
+ - name: Set up conda environment + uses: mamba-org/provision-with-micromamba@de032af7fb3675649f3d4bbdda85178ba412ee41 + with: + environment-file: ci/requirements/environment.yml + environment-name: xarray-tests + extra-specs: | + python="3.10" + - name: Install xarray + run: | + python -m pip install --no-deps . + - name: Install upstream flox + run: | + python -m pip install --no-deps \ + git+https://github.com/dcherian/flox.git@${{ github.ref }} + - name: Version info + run: | + conda info -a + conda list + python xarray/util/print_versions.py + - name: import xarray + run: | + python -c 'import xarray' + - name: import flox + run: | + python -c 'import flox' + - name: Run Tests + if: success() + id: status + run: | + set -euo pipefail + python -m pytest -n auto xarray/tests/test_groupby.py From 046cf68bd1f7bcf78e3f6eb6e71b3c289e4bc5df Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 8 Jul 2022 22:01:51 -0600 Subject: [PATCH 39/44] Many optimizations (#120) * Skip factorizing with RangeIndex fastpath for binning by multiple variables. * Workaround https://github.com/pandas-dev/pandas/issues/47614 * Avoid dispatching to pandas searchsorted * Remove unused variable. * ravel to reshape(-1) * Revert "Avoid dispatching to pandas searchsorted" This reverts commit 9aab6a4c194fa5c14b1f28ccb89dc7f8f8ebaa7d. --- flox/core.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/flox/core.py b/flox/core.py index b199878d5..f39a3fe4e 100644 --- a/flox/core.py +++ b/flox/core.py @@ -64,7 +64,7 @@ def _get_expected_groups(by, sort, *, raise_if_dask=True) -> pd.Index | None: if raise_if_dask: raise ValueError("Please provide expected_groups if not grouping by a numpy array.") return None - flatby = by.ravel() + flatby = by.reshape(-1) expected = pd.unique(flatby[~isnull(flatby)]) return _convert_expected_groups_to_index((expected,), isbin=(False,), sort=sort)[0] @@ -175,11 +175,11 @@ def find_group_cohorts(labels, chunks, merge=True, method="cohorts"): blocks = np.empty(np.prod(shape), dtype=object) for idx, block in enumerate(array.blocks.ravel()): blocks[idx] = np.full(tuple(block.shape[ax] for ax in axis), idx) - which_chunk = np.block(blocks.reshape(shape).tolist()).ravel() + which_chunk = np.block(blocks.reshape(shape).tolist()).reshape(-1) # We always drop NaN; np.unique also considers every NaN to be different so # it's really important we get rid of them. - raveled = labels.ravel() + raveled = labels.reshape(-1) unique_labels = np.unique(raveled[~isnull(raveled)]) # these are chunks where a label is present label_chunks = {lab: tuple(np.unique(which_chunk[raveled == lab])) for lab in unique_labels} @@ -421,19 +421,28 @@ def factorize_( factorized = [] found_groups = [] for groupvar, expect in zip(by, expected_groups): - flat = groupvar.ravel() - if isinstance(expect, pd.IntervalIndex): + flat = groupvar.reshape(-1) + if isinstance(expect, pd.RangeIndex): + idx = flat + found_groups.append(np.array(expect)) + # TODO: fix by using masked integers + idx[idx > expect[-1]] = -1 + + elif isinstance(expect, pd.IntervalIndex): # when binning we change expected groups to integers marking the interval # this makes the reindexing logic simpler. - if expect is None: - raise ValueError("Please pass bin edges in expected_groups.") - # TODO: fix for binning - found_groups.append(expect) - # pd.cut with bins = IntervalIndex[datetime64] doesn't work... 
+ # workaround for https://github.com/pandas-dev/pandas/issues/47614 + # we create breaks and pass that to pd.cut, disallow closed="both" for now. + if expect.closed == "both": + raise NotImplementedError if groupvar.dtype.kind == "M": - expect = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]]) + # pd.cut with bins = IntervalIndex[datetime64] doesn't work... + bins = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]]) + else: + bins = np.concatenate([expect.left.to_numpy(), [expect.right[-1]]]) # code is -1 for values outside the bounds of all intervals - idx = pd.cut(flat, bins=expect).codes.copy() + idx = pd.cut(flat, bins=bins, right=expect.closed_right).codes.copy() + found_groups.append(expect) else: if expect is not None and reindex: sorter = np.argsort(expect) @@ -447,7 +456,7 @@ def factorize_( idx = sorter[(idx,)] idx[mask] = -1 else: - idx, groups = pd.factorize(groupvar.ravel(), sort=sort) + idx, groups = pd.factorize(flat, sort=sort) found_groups.append(np.array(groups)) factorized.append(idx) @@ -473,7 +482,7 @@ def factorize_( # we collapse to a 2D by and axis=-1 offset_group = True group_idx, size = offset_labels(group_idx.reshape(by[0].shape), ngroups) - group_idx = group_idx.ravel() + group_idx = group_idx.reshape(-1) else: size = ngroups offset_group = False @@ -622,7 +631,7 @@ def chunk_reduce( # avoid by factorizing again so indices=[2,2,2] is changed to # indices=[0,0,0]. This is necessary when combining block results # factorize can handle strings etc unlike digitize - group_idx, groups, found_groups_shape, ngroups, size, props = factorize_( + group_idx, groups, found_groups_shape, _, size, props = factorize_( (by,), axis, expected_groups=(expected_groups,), reindex=reindex, sort=sort ) groups = groups[0] From 908397cf15bd66de66a43475cc5b701d97e75c46 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 9 Jul 2022 11:14:29 -0600 Subject: [PATCH 40/44] micromamba in CI (#122) --- .github/workflows/ci-additional.yaml | 105 --------------------------- .github/workflows/ci.yaml | 94 ++++++++++++++---------- 2 files changed, 54 insertions(+), 145 deletions(-) delete mode 100644 .github/workflows/ci-additional.yaml diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml deleted file mode 100644 index a4dc9024b..000000000 --- a/.github/workflows/ci-additional.yaml +++ /dev/null @@ -1,105 +0,0 @@ -name: CI Additional -on: - push: - branches: - - "main" - pull_request: - branches: - - "*" - schedule: - - cron: "0 0 * * *" # Daily “At 00:00” - workflow_dispatch: # allows you to trigger manually - -jobs: - detect-ci-trigger: - name: detect ci trigger - runs-on: ubuntu-latest - if: | - github.repository == 'dcherian/flox' - && (github.event_name == 'push' || github.event_name == 'pull_request') - outputs: - triggered: ${{ steps.detect-trigger.outputs.trigger-found }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - uses: xarray-contrib/ci-trigger@v1.1 - id: detect-trigger - with: - keyword: "[skip-ci]" - - test: - name: ${{ matrix.os }} ${{ matrix.env }} - runs-on: ${{ matrix.os }} - needs: detect-ci-trigger - if: needs.detect-ci-trigger.outputs.triggered == 'false' - defaults: - run: - shell: bash -l {0} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - env: - [ - "no-xarray", - "no-dask", - "minimal-requirements", - ] - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 # Fetch all history for all branches and tags. 
- - - name: Set environment variables - run: | - echo "CONDA_ENV_FILE=ci/${{ matrix.env }}.yml" >> $GITHUB_ENV - - - name: Cache conda - uses: actions/cache@v3 - with: - path: ~/conda_pkgs_dir - key: - ${{ runner.os }}-conda-${{ matrix.env }}-${{ - hashFiles('ci/**.yml') }} - - - uses: conda-incubator/setup-miniconda@v2 - with: - channels: conda-forge - channel-priority: strict - mamba-version: "*" - activate-environment: flox-tests - auto-update-conda: false - python-version: 3.9 - use-only-tar-bz2: true - - - name: Install conda dependencies - run: | - mamba env update -f $CONDA_ENV_FILE - - - name: Install flox - run: | - python -m pip install --no-deps -e . - - - name: Version info - run: | - conda info -a - conda list - - name: Import flox - run: | - python -c "import flox" - - name: Run tests - run: | - python -m pytest \ - -n auto \ - --cov=flox \ - --cov-report=xml - - - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.0 - with: - file: ./coverage.xml - flags: unittests,${{ matrix.env }} - env_vars: RUNNER_OS - name: codecov-umbrella - fail_ci_if_error: false diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b22c249c7..4db8e6a8a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,40 +32,22 @@ jobs: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set environment variables run: | - echo "CONDA_ENV_FILE=ci/environment.yml" >> $GITHUB_ENV echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - - name: Cache conda - uses: actions/cache@v3 - with: - path: ~/conda_pkgs_dir - key: - ${{ runner.os }}-conda-py${{ matrix.python-version }}-${{ - hashFiles('ci/**.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - channels: conda-forge - channel-priority: strict - mamba-version: "*" - activate-environment: flox-tests - auto-update-conda: false - python-version: ${{ matrix.python-version }} - use-only-tar-bz2: true - - - name: Install conda dependencies - run: | - mamba env update -f $CONDA_ENV_FILE - - name: Set up conda environment - shell: bash -l {0} + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ci/environment.yml + environment-name: flox-tests + cache-env: true + extra-specs: | + python="${{ matrix.python-version }}" + - name: Install flox run: | python -m pip install -e . conda list - - name: Run Tests - shell: bash -l {0} run: | pytest -n auto --cov=./ --cov-report=xml - - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.0 with: @@ -75,26 +57,57 @@ jobs: name: codecov-umbrella fail_ci_if_error: false + optional-deps: + name: ${{ matrix.env }} + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + env: + [ + "no-xarray", + "no-dask", + "minimal-requirements", + ] + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for all branches and tags. + - name: Set up conda environment + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ci/${{ matrix.env }}.yml + environment-name: flox-tests + cache-env: true + extra-specs: | + python="${{ matrix.python-version }}" + - name: Install flox + run: | + python -m pip install --no-deps -e . 
+ - name: Run tests + run: | + python -m pytest -n auto + upstream-dev: name: upstream-dev runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} steps: - uses: actions/checkout@v3 - - uses: conda-incubator/setup-miniconda@v2 - with: - channels: conda-forge - mamba-version: "*" - activate-environment: flox-tests - auto-update-conda: false - python-version: '3.10' - name: Set up conda environment - shell: bash -l {0} - run: | - mamba env update -f ci/upstream-dev-env.yml - python -m pip install -e . - conda list + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ci/upstream-dev-env.yml + environment-name: flox-tests + extra-specs: | + python="3.10" - name: Run Tests - shell: bash -l {0} run: | pytest -n 2 @@ -110,10 +123,11 @@ jobs: repository: 'pydata/xarray' fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@de032af7fb3675649f3d4bbdda85178ba412ee41 + uses: mamba-org/provision-with-micromamba@v12 with: environment-file: ci/requirements/environment.yml environment-name: xarray-tests + cache-env: true extra-specs: | python="3.10" - name: Install xarray From cd27a713ed710a8dd728a46f82c9cac563ea8b2e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 9 Jul 2022 11:15:21 -0600 Subject: [PATCH 41/44] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bb2d241d5..e76e625fb 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,12 @@ about this package, from the Pangeo Showcase). ## Acknowledgements -This work was funded in part by NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System -Data in the Cloud" (PI J. Hamman, NCAR), NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire) and [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). +This work was funded in part by +1. NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System +Data in the Cloud" (PI J. Hamman, NCAR), +2. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and +3. [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). + It was motivated by [very](https://github.com/pangeo-data/pangeo/issues/266) [very](https://github.com/pangeo-data/pangeo/issues/271) [many](https://github.com/dask/distributed/issues/2602) [discussions](https://github.com/pydata/xarray/issues/2237) in the [Pangeo](https://pangeo.io) community. 
## API From 8039059bca4208a0261577d99dcb0e624f81b087 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 11 Jul 2022 10:09:33 -0600 Subject: [PATCH 42/44] Update README.md (#123) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e76e625fb..afa0195af 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,13 @@ -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions)[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main)[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox)[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/)[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox)[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest)[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml)[![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) +[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions) +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main) +[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox) +[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest) + +[![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/) +[![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox) + +[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml) +[![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) # flox @@ -16,10 +25,10 @@ about this package, from the Pangeo Showcase). ## Acknowledgements -This work was funded in part by +This work was funded in part by 1. NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System -Data in the Cloud" (PI J. Hamman, NCAR), -2. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and +Data in the Cloud" (PI J. Hamman, NCAR), +2. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and 3. [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). 
It was motivated by [very](https://github.com/pangeo-data/pangeo/issues/266) [very](https://github.com/pangeo-data/pangeo/issues/271) [many](https://github.com/dask/distributed/issues/2602) [discussions](https://github.com/pydata/xarray/issues/2237) in the [Pangeo](https://pangeo.io) community. From 681cf74bcecb2de6e3801c3e7952385650440600 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 11 Jul 2022 12:26:39 -0600 Subject: [PATCH 43/44] Propagate attrs better (#124) --- flox/xarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/flox/xarray.py b/flox/xarray.py index 15e2ff148..358b57abd 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -366,7 +366,7 @@ def wrapper(array, *by, func, skipna, **kwargs): if all(d not in ds[var].dims for d in dim): actual[var] = ds[var] - for name, expect in zip(group_names, expected_groups): + for name, expect, by_ in zip(group_names, expected_groups, by): # Can't remove this till xarray handles IntervalIndex if isinstance(expect, pd.IntervalIndex): expect = expect.to_numpy() @@ -382,6 +382,8 @@ def wrapper(array, *by, func, skipna, **kwargs): actual = actual.set_coords(levelnames) else: actual[name] = expect + if keep_attrs: + actual[name].attrs = by_.attrs if unindexed_dims: actual = actual.drop_vars(unindexed_dims) From bed52b1902b282c70a8355fd9a417045a5503985 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 12 Jul 2022 17:23:45 -0600 Subject: [PATCH 44/44] Rename repo to xarray-contrib/flox (#125) --- README.md | 6 +++--- ci/docs.yml | 2 +- docs/source/conf.py | 4 ++-- docs/source/index.md | 6 +++--- setup.cfg | 2 +- tests/test_xarray.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index afa0195af..7771ab9be 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/dcherian/flox/main.svg)](https://results.pre-commit.ci/latest/github/dcherian/flox/main) -[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox) +[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/CI?logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/xarray-contrib/flox/main.svg)](https://results.pre-commit.ci/latest/github/xarray-contrib/flox/main) +[![image](https://img.shields.io/codecov/c/github/xarray-contrib/flox.svg?style=flat)](https://codecov.io/gh/xarray-contrib/flox) [![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest) [![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/) diff --git a/ci/docs.yml b/ci/docs.yml index e9c3f56ae..b11768282 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -16,4 +16,4 @@ dependencies: - ipykernel - jupyter - pip: - - git+https://github.com/dcherian/flox + - git+https://github.com/xarray-contrib/flox diff --git a/docs/source/conf.py b/docs/source/conf.py index 6fda3c575..8790c9502 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,8 +43,8 @@ ] extlinks = { - "issue": ("https://github.com/dcherian/flox/issues/%s", "GH#"), - "pr": ("https://github.com/dcherian/flox/pull/%s", "GH#"), + "issue": 
("https://github.com/xarray-contrib/flox/issues/%s", "GH#"), + "pr": ("https://github.com/xarray-contrib/flox/pull/%s", "GH#"), } templates_path = ["_templates"] diff --git a/docs/source/index.md b/docs/source/index.md index 6149998b9..cf4c5c3ef 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,9 +2,9 @@ ## Overview -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/dcherian/flox/CI?logo=github&style=flat)](https://github.com/dcherian/flox/actions) -[![GitHub Workflow Code Style Status](https://img.shields.io/github/workflow/status/dcherian/flox/code-style?label=Code%20Style&style=flat)](https://github.com/dcherian/flox/actions) -[![image](https://img.shields.io/codecov/c/github/dcherian/flox.svg?style=flat)](https://codecov.io/gh/dcherian/flox) +[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/CI?logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![GitHub Workflow Code Style Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/code-style?label=Code%20Style&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![image](https://img.shields.io/codecov/c/github/xarray-contrib/flox.svg?style=flat)](https://codecov.io/gh/xarray-contrib/flox) [![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/) [![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox) diff --git a/setup.cfg b/setup.cfg index 72790e8a6..e99882db4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,7 @@ description = GroupBy operations for dask.array long_description = file: README.md long_description_content_type=text/markdown -url = https://github.com/dcherian/flox +url = https://github.com/xarray-contrib/flox classifiers = Development Status :: 4 - Beta License :: OSI Approved :: Apache Software License diff --git a/tests/test_xarray.py b/tests/test_xarray.py index fa54e4e54..0a696b24a 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -457,7 +457,7 @@ def test_groupby_bins_indexed_coordinate(): def test_mixed_grouping(chunk): if not has_dask and chunk: pytest.skip() - # regression test for https://github.com/dcherian/flox/pull/111 + # regression test for https://github.com/xarray-contrib/flox/pull/111 sa = 10 sb = 13 sc = 3