diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 24dcb8c9687..2332f7f236b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,11 @@ New Features ~~~~~~~~~~~~ - New "random" method for converting to and from 360_day calendars (:pull:`8603`). By `Pascal Bourgault `_. - +- Xarray now makes a best attempt not to coerce :py:class:`pandas.api.extensions.ExtensionArray` to a numpy array + by supporting 1D `ExtensionArray` objects internally where possible. Thus, `Dataset`s initialized with a `pd.Catgeorical`, + for example, will retain the object. However, one cannot do operations that are not possible on the `ExtensionArray` + then, such as broadcasting. + By `Ilan Gold `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -36,6 +40,12 @@ Bug fixes Internal Changes ~~~~~~~~~~~~~~~~ +- Migrates ``formatting_html`` functionality for `DataTree` into ``xarray/core`` (:pull: `8930`) + By `Eni Awowale `_, `Julia Signell `_ + and `Tom Nicholas `_. +- Migrates ``datatree_mapping`` functionality into ``xarray/core`` (:pull:`8948`) + By `Matt Savoie `_ `Owen Littlejohns + ` and `Tom Nicholas `_. .. _whats-new.2024.03.0: diff --git a/properties/test_pandas_roundtrip.py b/properties/test_pandas_roundtrip.py index 5c0f14976e6..3d87fcce1d9 100644 --- a/properties/test_pandas_roundtrip.py +++ b/properties/test_pandas_roundtrip.py @@ -17,7 +17,9 @@ from hypothesis import given # isort:skip numeric_dtypes = st.one_of( - npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes() + npst.unsigned_integer_dtypes(endianness="="), + npst.integer_dtypes(endianness="="), + npst.floating_dtypes(endianness="="), ) numeric_series = numeric_dtypes.flatmap(lambda dt: pdst.series(dtype=dt)) diff --git a/pyproject.toml b/pyproject.toml index bdbfd9b52ab..8cbd395b2a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ accel = ["scipy", "bottleneck", "numbagg", "flox", "opt_einsum"] complete = ["xarray[accel,io,parallel,viz,dev]"] dev = [ "hypothesis", + "mypy", "pre-commit", "pytest", "pytest-cov", @@ -86,8 +87,8 @@ exclude_lines = ["pragma: no cover", "if TYPE_CHECKING"] [tool.mypy] enable_error_code = "redundant-self" exclude = [ - 'xarray/util/generate_.*\.py', - 'xarray/datatree_/.*\.py', + 'xarray/util/generate_.*\.py', + 'xarray/datatree_/.*\.py', ] files = "xarray" show_error_codes = true @@ -98,8 +99,8 @@ warn_unused_ignores = true # Ignore mypy errors for modules imported from datatree_. [[tool.mypy.overrides]] -module = "xarray.datatree_.*" ignore_errors = true +module = "xarray.datatree_.*" # Much of the numerical computing stack doesn't have type annotations yet. [[tool.mypy.overrides]] @@ -129,6 +130,7 @@ module = [ "opt_einsum.*", "pandas.*", "pooch.*", + "pyarrow.*", "pydap.*", "pytest.*", "scipy.*", @@ -255,6 +257,9 @@ target-version = "py39" # E402: module level import not at top of file # E501: line too long - let black worry about that # E731: do not assign a lambda expression, use a def +extend-safe-fixes = [ + "TID252", # absolute imports +] ignore = [ "E402", "E501", @@ -268,9 +273,6 @@ select = [ "I", # isort "UP", # Pyupgrade ] -extend-safe-fixes = [ - "TID252", # absolute imports -] [tool.ruff.lint.per-file-ignores] # don't enforce absolute imports diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 900c10026fb..044273afc35 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -24,6 +24,7 @@ from typing import IO, TYPE_CHECKING, Any, Callable, Generic, Literal, cast, overload import numpy as np +from pandas.api.types import is_extension_array_dtype # remove once numpy 2.0 is the oldest supported version try: @@ -6853,10 +6854,13 @@ def reduce( if ( # Some reduction functions (e.g. std, var) need to run on variables # that don't have the reduce dims: PR5393 - not reduce_dims - or not numeric_only - or np.issubdtype(var.dtype, np.number) - or (var.dtype == np.bool_) + not is_extension_array_dtype(var.dtype) + and ( + not reduce_dims + or not numeric_only + or np.issubdtype(var.dtype, np.number) + or (var.dtype == np.bool_) + ) ): # prefer to aggregate over axis=None rather than # axis=(0, 1) if they will be equivalent, because @@ -7169,13 +7173,37 @@ def to_pandas(self) -> pd.Series | pd.DataFrame: ) def _to_dataframe(self, ordered_dims: Mapping[Any, int]): - columns = [k for k in self.variables if k not in self.dims] + columns_in_order = [k for k in self.variables if k not in self.dims] + non_extension_array_columns = [ + k + for k in columns_in_order + if not is_extension_array_dtype(self.variables[k].data) + ] + extension_array_columns = [ + k + for k in columns_in_order + if is_extension_array_dtype(self.variables[k].data) + ] data = [ self._variables[k].set_dims(ordered_dims).values.reshape(-1) - for k in columns + for k in non_extension_array_columns ] index = self.coords.to_index([*ordered_dims]) - return pd.DataFrame(dict(zip(columns, data)), index=index) + broadcasted_df = pd.DataFrame( + dict(zip(non_extension_array_columns, data)), index=index + ) + for extension_array_column in extension_array_columns: + extension_array = self.variables[extension_array_column].data.array + index = self[self.variables[extension_array_column].dims[0]].data + extension_array_df = pd.DataFrame( + {extension_array_column: extension_array}, + index=self[self.variables[extension_array_column].dims[0]].data, + ) + extension_array_df.index.name = self.variables[extension_array_column].dims[ + 0 + ] + broadcasted_df = broadcasted_df.join(extension_array_df) + return broadcasted_df[columns_in_order] def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame: """Convert this dataset into a pandas.DataFrame. @@ -7322,11 +7350,13 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self: "cannot convert a DataFrame with a non-unique MultiIndex into xarray" ) - # Cast to a NumPy array first, in case the Series is a pandas Extension - # array (which doesn't have a valid NumPy dtype) - # TODO: allow users to control how this casting happens, e.g., by - # forwarding arguments to pandas.Series.to_numpy? - arrays = [(k, np.asarray(v)) for k, v in dataframe.items()] + arrays = [] + extension_arrays = [] + for k, v in dataframe.items(): + if not is_extension_array_dtype(v): + arrays.append((k, np.asarray(v))) + else: + extension_arrays.append((k, v)) indexes: dict[Hashable, Index] = {} index_vars: dict[Hashable, Variable] = {} @@ -7340,6 +7370,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self: xr_idx = PandasIndex(lev, dim) indexes[dim] = xr_idx index_vars.update(xr_idx.create_variables()) + arrays += [(k, np.asarray(v)) for k, v in extension_arrays] + extension_arrays = [] else: index_name = idx.name if idx.name is not None else "index" dims = (index_name,) @@ -7353,7 +7385,9 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self: obj._set_sparse_data_from_dataframe(idx, arrays, dims) else: obj._set_numpy_data_from_dataframe(idx, arrays, dims) - return obj + for name, extension_array in extension_arrays: + obj[name] = (dims, extension_array) + return obj[dataframe.columns] if len(dataframe.columns) else obj def to_dask_dataframe( self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 1b06d87c9fb..57fd7222898 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -18,6 +18,14 @@ from xarray.core.coordinates import DatasetCoordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, DataVariables +from xarray.core.datatree_mapping import ( + TreeIsomorphismError, + check_isomorphic, + map_over_subtree, +) +from xarray.core.formatting_html import ( + datatree_repr as datatree_repr_html, +) from xarray.core.indexes import Index, Indexes from xarray.core.merge import dataset_update_method from xarray.core.options import OPTIONS as XR_OPTS @@ -33,14 +41,6 @@ from xarray.core.variable import Variable from xarray.datatree_.datatree.common import TreeAttrAccessMixin from xarray.datatree_.datatree.formatting import datatree_repr -from xarray.datatree_.datatree.formatting_html import ( - datatree_repr as datatree_repr_html, -) -from xarray.datatree_.datatree.mapping import ( - TreeIsomorphismError, - check_isomorphic, - map_over_subtree, -) from xarray.datatree_.datatree.ops import ( DataTreeArithmeticMixin, MappedDatasetMethodsMixin, diff --git a/xarray/datatree_/datatree/mapping.py b/xarray/core/datatree_mapping.py similarity index 94% rename from xarray/datatree_/datatree/mapping.py rename to xarray/core/datatree_mapping.py index 9546905e1ac..714921d2a90 100644 --- a/xarray/datatree_/datatree/mapping.py +++ b/xarray/core/datatree_mapping.py @@ -4,10 +4,9 @@ import sys from itertools import repeat from textwrap import dedent -from typing import TYPE_CHECKING, Callable, Tuple +from typing import TYPE_CHECKING, Callable from xarray import DataArray, Dataset - from xarray.core.iterators import LevelOrderIter from xarray.core.treenode import NodePath, TreeNode @@ -84,14 +83,13 @@ def diff_treestructure(a: DataTree, b: DataTree, require_names_equal: bool) -> s for node_a, node_b in zip(LevelOrderIter(a), LevelOrderIter(b)): path_a, path_b = node_a.path, node_b.path - if require_names_equal: - if node_a.name != node_b.name: - diff = dedent( - f"""\ + if require_names_equal and node_a.name != node_b.name: + diff = dedent( + f"""\ Node '{path_a}' in the left object has name '{node_a.name}' Node '{path_b}' in the right object has name '{node_b.name}'""" - ) - return diff + ) + return diff if len(node_a.children) != len(node_b.children): diff = dedent( @@ -125,7 +123,7 @@ def map_over_subtree(func: Callable) -> Callable: func : callable Function to apply to datasets with signature: - `func(*args, **kwargs) -> Union[Dataset, Iterable[Dataset]]`. + `func(*args, **kwargs) -> Union[DataTree, Iterable[DataTree]]`. (i.e. func must accept at least one Dataset and return at least one Dataset.) Function will not be applied to any nodes without datasets. @@ -154,7 +152,7 @@ def map_over_subtree(func: Callable) -> Callable: # TODO inspect function to work out immediately if the wrong number of arguments were passed for it? @functools.wraps(func) - def _map_over_subtree(*args, **kwargs) -> DataTree | Tuple[DataTree, ...]: + def _map_over_subtree(*args, **kwargs) -> DataTree | tuple[DataTree, ...]: """Internal function which maps func over every node in tree, returning a tree of the results.""" from xarray.core.datatree import DataTree @@ -259,7 +257,7 @@ def _map_over_subtree(*args, **kwargs) -> DataTree | Tuple[DataTree, ...]: return _map_over_subtree -def _handle_errors_with_path_context(path): +def _handle_errors_with_path_context(path: str): """Wraps given function so that if it fails it also raises path to node on which it failed.""" def decorator(func): @@ -267,11 +265,10 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: - if sys.version_info >= (3, 11): - # Add the context information to the error message - e.add_note( - f"Raised whilst mapping function over node with path {path}" - ) + # Add the context information to the error message + add_note( + e, f"Raised whilst mapping function over node with path {path}" + ) raise return wrapper @@ -287,7 +284,9 @@ def add_note(err: BaseException, msg: str) -> None: err.add_note(msg) -def _check_single_set_return_values(path_to_node, obj): +def _check_single_set_return_values( + path_to_node: str, obj: Dataset | DataArray | tuple[Dataset | DataArray] +): """Check types returned from single evaluation of func, and return number of return values received from func.""" if isinstance(obj, (Dataset, DataArray)): return 1 diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ef497e78ebf..d95dfa566cc 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -32,6 +32,7 @@ from numpy import concatenate as _concatenate from numpy.lib.stride_tricks import sliding_window_view # noqa from packaging.version import Version +from pandas.api.types import is_extension_array_dtype from xarray.core import dask_array_ops, dtypes, nputils from xarray.core.options import OPTIONS @@ -156,7 +157,7 @@ def isnull(data): return full_like(data, dtype=bool, fill_value=False) else: # at this point, array should have dtype=object - if isinstance(data, np.ndarray): + if isinstance(data, np.ndarray) or is_extension_array_dtype(data): return pandas_isnull(data) else: # Not reachable yet, but intended for use with other duck array @@ -221,9 +222,19 @@ def asarray(data, xp=np): def as_shared_dtype(scalars_or_arrays, xp=np): """Cast a arrays to a shared dtype using xarray's type promotion rules.""" - array_type_cupy = array_type("cupy") - if array_type_cupy and any( - isinstance(x, array_type_cupy) for x in scalars_or_arrays + if any(is_extension_array_dtype(x) for x in scalars_or_arrays): + extension_array_types = [ + x.dtype for x in scalars_or_arrays if is_extension_array_dtype(x) + ] + if len(extension_array_types) == len(scalars_or_arrays) and all( + isinstance(x, type(extension_array_types[0])) for x in extension_array_types + ): + return scalars_or_arrays + raise ValueError( + f"Cannot cast arrays to shared type, found array types {[x.dtype for x in scalars_or_arrays]}" + ) + elif array_type_cupy := array_type("cupy") and any( # noqa: F841 + isinstance(x, array_type_cupy) for x in scalars_or_arrays # noqa: F821 ): import cupy as cp diff --git a/xarray/core/extension_array.py b/xarray/core/extension_array.py new file mode 100644 index 00000000000..6521e425615 --- /dev/null +++ b/xarray/core/extension_array.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from collections.abc import Sequence +from typing import Callable, Generic + +import numpy as np +import pandas as pd +from pandas.api.types import is_extension_array_dtype + +from xarray.core.types import DTypeLikeSave, T_ExtensionArray + +HANDLED_EXTENSION_ARRAY_FUNCTIONS: dict[Callable, Callable] = {} + + +def implements(numpy_function): + """Register an __array_function__ implementation for MyArray objects.""" + + def decorator(func): + HANDLED_EXTENSION_ARRAY_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@implements(np.issubdtype) +def __extension_duck_array__issubdtype( + extension_array_dtype: T_ExtensionArray, other_dtype: DTypeLikeSave +) -> bool: + return False # never want a function to think a pandas extension dtype is a subtype of numpy + + +@implements(np.broadcast_to) +def __extension_duck_array__broadcast(arr: T_ExtensionArray, shape: tuple): + if shape[0] == len(arr) and len(shape) == 1: + return arr + raise NotImplementedError("Cannot broadcast 1d-only pandas categorical array.") + + +@implements(np.stack) +def __extension_duck_array__stack(arr: T_ExtensionArray, axis: int): + raise NotImplementedError("Cannot stack 1d-only pandas categorical array.") + + +@implements(np.concatenate) +def __extension_duck_array__concatenate( + arrays: Sequence[T_ExtensionArray], axis: int = 0, out=None +) -> T_ExtensionArray: + return type(arrays[0])._concat_same_type(arrays) + + +@implements(np.where) +def __extension_duck_array__where( + condition: np.ndarray, x: T_ExtensionArray, y: T_ExtensionArray +) -> T_ExtensionArray: + if ( + isinstance(x, pd.Categorical) + and isinstance(y, pd.Categorical) + and x.dtype != y.dtype + ): + x = x.add_categories(set(y.categories).difference(set(x.categories))) + y = y.add_categories(set(x.categories).difference(set(y.categories))) + return pd.Series(x).where(condition, pd.Series(y)).array + + +class PandasExtensionArray(Generic[T_ExtensionArray]): + array: T_ExtensionArray + + def __init__(self, array: T_ExtensionArray): + """NEP-18 compliant wrapper for pandas extension arrays. + + Parameters + ---------- + array : T_ExtensionArray + The array to be wrapped upon e.g,. :py:class:`xarray.Variable` creation. + ``` + """ + if not isinstance(array, pd.api.extensions.ExtensionArray): + raise TypeError(f"{array} is not an pandas ExtensionArray.") + self.array = array + + def __array_function__(self, func, types, args, kwargs): + def replace_duck_with_extension_array(args) -> list: + args_as_list = list(args) + for index, value in enumerate(args_as_list): + if isinstance(value, PandasExtensionArray): + args_as_list[index] = value.array + elif isinstance( + value, tuple + ): # should handle more than just tuple? iterable? + args_as_list[index] = tuple( + replace_duck_with_extension_array(value) + ) + elif isinstance(value, list): + args_as_list[index] = replace_duck_with_extension_array(value) + return args_as_list + + args = tuple(replace_duck_with_extension_array(args)) + if func not in HANDLED_EXTENSION_ARRAY_FUNCTIONS: + return func(*args, **kwargs) + res = HANDLED_EXTENSION_ARRAY_FUNCTIONS[func](*args, **kwargs) + if is_extension_array_dtype(res): + return type(self)[type(res)](res) + return res + + def __array_ufunc__(ufunc, method, *inputs, **kwargs): + return ufunc(*inputs, **kwargs) + + def __repr__(self): + return f"{type(self)}(array={repr(self.array)})" + + def __getattr__(self, attr: str) -> object: + return getattr(self.array, attr) + + def __getitem__(self, key) -> PandasExtensionArray[T_ExtensionArray]: + item = self.array[key] + if is_extension_array_dtype(item): + return type(self)(item) + if np.isscalar(item): + return type(self)(type(self.array)([item])) + return item + + def __setitem__(self, key, val): + self.array[key] = val + + def __eq__(self, other): + if np.isscalar(other): + other = type(self)(type(self.array)([other])) + if isinstance(other, PandasExtensionArray): + return self.array == other.array + return self.array == other + + def __ne__(self, other): + return ~(self == other) + + def __len__(self): + return len(self.array) diff --git a/xarray/core/formatting_html.py b/xarray/core/formatting_html.py index 2c76b182207..9bf5befbe3f 100644 --- a/xarray/core/formatting_html.py +++ b/xarray/core/formatting_html.py @@ -2,9 +2,11 @@ import uuid from collections import OrderedDict +from collections.abc import Mapping from functools import lru_cache, partial from html import escape from importlib.resources import files +from typing import TYPE_CHECKING from xarray.core.formatting import ( inline_index_repr, @@ -18,6 +20,9 @@ ("xarray.static.css", "style.css"), ) +if TYPE_CHECKING: + from xarray.core.datatree import DataTree + @lru_cache(None) def _load_static_files(): @@ -341,3 +346,129 @@ def dataset_repr(ds) -> str: ] return _obj_repr(ds, header_components, sections) + + +def summarize_datatree_children(children: Mapping[str, DataTree]) -> str: + N_CHILDREN = len(children) - 1 + + # Get result from datatree_node_repr and wrap it + lines_callback = lambda n, c, end: _wrap_datatree_repr( + datatree_node_repr(n, c), end=end + ) + + children_html = "".join( + ( + lines_callback(n, c, end=False) # Long lines + if i < N_CHILDREN + else lines_callback(n, c, end=True) + ) # Short lines + for i, (n, c) in enumerate(children.items()) + ) + + return "".join( + [ + "
", + children_html, + "
", + ] + ) + + +children_section = partial( + _mapping_section, + name="Groups", + details_func=summarize_datatree_children, + max_items_collapse=1, + expand_option_name="display_expand_groups", +) + + +def datatree_node_repr(group_title: str, dt: DataTree) -> str: + header_components = [f"
{escape(group_title)}
"] + + ds = dt.ds + + sections = [ + children_section(dt.children), + dim_section(ds), + coord_section(ds.coords), + datavar_section(ds.data_vars), + attr_section(ds.attrs), + ] + + return _obj_repr(ds, header_components, sections) + + +def _wrap_datatree_repr(r: str, end: bool = False) -> str: + """ + Wrap HTML representation with a tee to the left of it. + + Enclosing HTML tag is a
with :code:`display: inline-grid` style. + + Turns: + [ title ] + | details | + |_____________| + + into (A): + |─ [ title ] + | | details | + | |_____________| + + or (B): + └─ [ title ] + | details | + |_____________| + + Parameters + ---------- + r: str + HTML representation to wrap. + end: bool + Specify if the line on the left should continue or end. + + Default is True. + + Returns + ------- + str + Wrapped HTML representation. + + Tee color is set to the variable :code:`--xr-border-color`. + """ + # height of line + end = bool(end) + height = "100%" if end is False else "1.2em" + return "".join( + [ + "
", + "
", + "
", + "
", + "
", + "
", + r, + "
", + "
", + ] + ) + + +def datatree_repr(dt: DataTree) -> str: + obj_type = f"datatree.{type(dt).__name__}" + return datatree_node_repr(obj_type, dt) diff --git a/xarray/core/iterators.py b/xarray/core/iterators.py index 5c0c0f652e8..dd5fa7ee97a 100644 --- a/xarray/core/iterators.py +++ b/xarray/core/iterators.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import abc from collections.abc import Iterator from typing import Callable @@ -9,7 +8,7 @@ """These iterators are copied from anytree.iterators, with minor modifications.""" -class LevelOrderIter(abc.Iterator): +class LevelOrderIter(Iterator): """Iterate over tree applying level-order strategy starting at `node`. This is the iterator used by `DataTree` to traverse nodes. diff --git a/xarray/core/types.py b/xarray/core/types.py index 242aed240fb..17b06eb0805 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -173,6 +173,9 @@ def copy( # hopefully in the future we can narrow this down more: T_DuckArray = TypeVar("T_DuckArray", bound=Any, covariant=True) +# For typing pandas extension arrays. +T_ExtensionArray = TypeVar("T_ExtensionArray", bound=pd.api.extensions.ExtensionArray) + ScalarOrArray = Union["ArrayLike", np.generic, np.ndarray, "DaskArray"] VarCompatible = Union["Variable", "ScalarOrArray"] diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e89cf95411c..2229eaa2d24 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -13,11 +13,13 @@ import numpy as np import pandas as pd from numpy.typing import ArrayLike +from pandas.api.types import is_extension_array_dtype import xarray as xr # only for Dataset and DataArray from xarray.core import common, dtypes, duck_array_ops, indexing, nputils, ops, utils from xarray.core.arithmetic import VariableArithmetic from xarray.core.common import AbstractArray +from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( BasicIndexer, OuterIndexer, @@ -47,6 +49,7 @@ NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( indexing.ExplicitlyIndexed, pd.Index, + pd.api.extensions.ExtensionArray, ) # https://github.com/python/mypy/issues/224 BASIC_INDEXING_TYPES = integer_types + (slice,) @@ -184,6 +187,8 @@ def _maybe_wrap_data(data): """ if isinstance(data, pd.Index): return PandasIndexingAdapter(data) + if isinstance(data, pd.api.extensions.ExtensionArray): + return PandasExtensionArray[type(data)](data) return data @@ -2570,6 +2575,11 @@ def chunk( # type: ignore[override] dask.array.from_array """ + if is_extension_array_dtype(self): + raise ValueError( + f"{self} was found to be a Pandas ExtensionArray. Please convert to numpy first." + ) + if from_array_kwargs is None: from_array_kwargs = {} diff --git a/xarray/datatree_/datatree/__init__.py b/xarray/datatree_/datatree/__init__.py index f2603b64641..3159d612913 100644 --- a/xarray/datatree_/datatree/__init__.py +++ b/xarray/datatree_/datatree/__init__.py @@ -1,11 +1,8 @@ # import public API -from .mapping import TreeIsomorphismError, map_over_subtree from xarray.core.treenode import InvalidTreeError, NotFoundInTreeError __all__ = ( - "TreeIsomorphismError", "InvalidTreeError", "NotFoundInTreeError", - "map_over_subtree", ) diff --git a/xarray/datatree_/datatree/formatting.py b/xarray/datatree_/datatree/formatting.py index 9ebee72d4ef..fdd23933ae6 100644 --- a/xarray/datatree_/datatree/formatting.py +++ b/xarray/datatree_/datatree/formatting.py @@ -2,7 +2,7 @@ from xarray.core.formatting import _compat_to_str, diff_dataset_repr -from xarray.datatree_.datatree.mapping import diff_treestructure +from xarray.core.datatree_mapping import diff_treestructure from xarray.datatree_.datatree.render import RenderTree if TYPE_CHECKING: diff --git a/xarray/datatree_/datatree/formatting_html.py b/xarray/datatree_/datatree/formatting_html.py deleted file mode 100644 index 547b567a396..00000000000 --- a/xarray/datatree_/datatree/formatting_html.py +++ /dev/null @@ -1,135 +0,0 @@ -from functools import partial -from html import escape -from typing import Any, Mapping - -from xarray.core.formatting_html import ( - _mapping_section, - _obj_repr, - attr_section, - coord_section, - datavar_section, - dim_section, -) - - -def summarize_children(children: Mapping[str, Any]) -> str: - N_CHILDREN = len(children) - 1 - - # Get result from node_repr and wrap it - lines_callback = lambda n, c, end: _wrap_repr(node_repr(n, c), end=end) - - children_html = "".join( - lines_callback(n, c, end=False) # Long lines - if i < N_CHILDREN - else lines_callback(n, c, end=True) # Short lines - for i, (n, c) in enumerate(children.items()) - ) - - return "".join( - [ - "
", - children_html, - "
", - ] - ) - - -children_section = partial( - _mapping_section, - name="Groups", - details_func=summarize_children, - max_items_collapse=1, - expand_option_name="display_expand_groups", -) - - -def node_repr(group_title: str, dt: Any) -> str: - header_components = [f"
{escape(group_title)}
"] - - ds = dt.ds - - sections = [ - children_section(dt.children), - dim_section(ds), - coord_section(ds.coords), - datavar_section(ds.data_vars), - attr_section(ds.attrs), - ] - - return _obj_repr(ds, header_components, sections) - - -def _wrap_repr(r: str, end: bool = False) -> str: - """ - Wrap HTML representation with a tee to the left of it. - - Enclosing HTML tag is a
with :code:`display: inline-grid` style. - - Turns: - [ title ] - | details | - |_____________| - - into (A): - |─ [ title ] - | | details | - | |_____________| - - or (B): - └─ [ title ] - | details | - |_____________| - - Parameters - ---------- - r: str - HTML representation to wrap. - end: bool - Specify if the line on the left should continue or end. - - Default is True. - - Returns - ------- - str - Wrapped HTML representation. - - Tee color is set to the variable :code:`--xr-border-color`. - """ - # height of line - end = bool(end) - height = "100%" if end is False else "1.2em" - return "".join( - [ - "
", - "
", - "
", - "
", - "
", - "
", - "
    ", - r, - "
" "
", - "
", - ] - ) - - -def datatree_repr(dt: Any) -> str: - obj_type = f"datatree.{type(dt).__name__}" - return node_repr(obj_type, dt) diff --git a/xarray/datatree_/datatree/ops.py b/xarray/datatree_/datatree/ops.py index d6ac4f83e7c..83b9d1b275a 100644 --- a/xarray/datatree_/datatree/ops.py +++ b/xarray/datatree_/datatree/ops.py @@ -2,7 +2,7 @@ from xarray import Dataset -from .mapping import map_over_subtree +from xarray.core.datatree_mapping import map_over_subtree """ Module which specifies the subset of xarray.Dataset's API which we wish to copy onto DataTree. diff --git a/xarray/datatree_/datatree/tests/test_formatting_html.py b/xarray/datatree_/datatree/tests/test_formatting_html.py deleted file mode 100644 index 98cdf02bff4..00000000000 --- a/xarray/datatree_/datatree/tests/test_formatting_html.py +++ /dev/null @@ -1,198 +0,0 @@ -import pytest -import xarray as xr - -from xarray.core.datatree import DataTree -from xarray.datatree_.datatree import formatting_html - - -@pytest.fixture(scope="module", params=["some html", "some other html"]) -def repr(request): - return request.param - - -class Test_summarize_children: - """ - Unit tests for summarize_children. - """ - - func = staticmethod(formatting_html.summarize_children) - - @pytest.fixture(scope="class") - def childfree_tree_factory(self): - """ - Fixture for a child-free DataTree factory. - """ - from random import randint - - def _childfree_tree_factory(): - return DataTree( - data=xr.Dataset({"z": ("y", [randint(1, 100) for _ in range(3)])}) - ) - - return _childfree_tree_factory - - @pytest.fixture(scope="class") - def childfree_tree(self, childfree_tree_factory): - """ - Fixture for a child-free DataTree. - """ - return childfree_tree_factory() - - @pytest.fixture(scope="function") - def mock_node_repr(self, monkeypatch): - """ - Apply mocking for node_repr. - """ - - def mock(group_title, dt): - """ - Mock with a simple result - """ - return group_title + " " + str(id(dt)) - - monkeypatch.setattr(formatting_html, "node_repr", mock) - - @pytest.fixture(scope="function") - def mock_wrap_repr(self, monkeypatch): - """ - Apply mocking for _wrap_repr. - """ - - def mock(r, *, end, **kwargs): - """ - Mock by appending "end" or "not end". - """ - return r + " " + ("end" if end else "not end") + "//" - - monkeypatch.setattr(formatting_html, "_wrap_repr", mock) - - def test_empty_mapping(self): - """ - Test with an empty mapping of children. - """ - children = {} - assert self.func(children) == ( - "
" "
" - ) - - def test_one_child(self, childfree_tree, mock_wrap_repr, mock_node_repr): - """ - Test with one child. - - Uses a mock of _wrap_repr and node_repr to essentially mock - the inline lambda function "lines_callback". - """ - # Create mapping of children - children = {"a": childfree_tree} - - # Expect first line to be produced from the first child, and - # wrapped as the last child - first_line = f"a {id(children['a'])} end//" - - assert self.func(children) == ( - "
" - f"{first_line}" - "
" - ) - - def test_two_children(self, childfree_tree_factory, mock_wrap_repr, mock_node_repr): - """ - Test with two level deep children. - - Uses a mock of _wrap_repr and node_repr to essentially mock - the inline lambda function "lines_callback". - """ - - # Create mapping of children - children = {"a": childfree_tree_factory(), "b": childfree_tree_factory()} - - # Expect first line to be produced from the first child, and - # wrapped as _not_ the last child - first_line = f"a {id(children['a'])} not end//" - - # Expect second line to be produced from the second child, and - # wrapped as the last child - second_line = f"b {id(children['b'])} end//" - - assert self.func(children) == ( - "
" - f"{first_line}" - f"{second_line}" - "
" - ) - - -class Test__wrap_repr: - """ - Unit tests for _wrap_repr. - """ - - func = staticmethod(formatting_html._wrap_repr) - - def test_end(self, repr): - """ - Test with end=True. - """ - r = self.func(repr, end=True) - assert r == ( - "
" - "
" - "
" - "
" - "
" - "
" - "
    " - f"{repr}" - "
" - "
" - "
" - ) - - def test_not_end(self, repr): - """ - Test with end=False. - """ - r = self.func(repr, end=False) - assert r == ( - "
" - "
" - "
" - "
" - "
" - "
" - "
    " - f"{repr}" - "
" - "
" - "
" - ) diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index d2503dfd535..449d0c793cc 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ -45,7 +45,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: Generates only those numpy dtypes which xarray can handle. Use instead of hypothesis.extra.numpy.scalar_dtypes in order to exclude weirder dtypes such as unicode, byte_string, array, or nested dtypes. - Also excludes datetimes, which dodges bugs with pandas non-nanosecond datetime overflows. + Also excludes datetimes, which dodges bugs with pandas non-nanosecond datetime overflows. Checks only native endianness. Requires the hypothesis package to be installed. @@ -56,10 +56,10 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: # TODO should this be exposed publicly? # We should at least decide what the set of numpy dtypes that xarray officially supports is. return ( - npst.integer_dtypes() - | npst.unsigned_integer_dtypes() - | npst.floating_dtypes() - | npst.complex_number_dtypes() + npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=") + | npst.complex_number_dtypes(endianness="=") # | npst.datetime64_dtypes() # | npst.timedelta64_dtypes() # | npst.unicode_string_dtypes() diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 5007db9eeb2..3ce788dfb7f 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -18,6 +18,7 @@ from xarray import Dataset from xarray.core import utils from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 +from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ExplicitlyIndexed from xarray.core.options import set_options from xarray.core.variable import IndexVariable @@ -52,7 +53,9 @@ def assert_writeable(ds): readonly = [ name for name, var in ds.variables.items() - if not isinstance(var, IndexVariable) and not var.data.flags.writeable + if not isinstance(var, IndexVariable) + and not isinstance(var.data, PandasExtensionArray) + and not var.data.flags.writeable ] assert not readonly, readonly @@ -112,6 +115,7 @@ def _importorskip( has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg", "0.4.0") +has_pyarrow, requires_pyarrow = _importorskip("pyarrow") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -307,6 +311,7 @@ def create_test_data( seed: int | None = None, add_attrs: bool = True, dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES, + use_extension_array: bool = False, ) -> Dataset: rs = np.random.RandomState(seed) _vars = { @@ -329,7 +334,16 @@ def create_test_data( obj[v] = (dims, data) if add_attrs: obj[v].attrs = {"foo": "variable"} - + if use_extension_array: + obj["var4"] = ( + "dim1", + pd.Categorical( + np.random.choice( + list(string.ascii_lowercase[: np.random.randint(5)]), + size=dim_sizes[0], + ) + ), + ) if dim_sizes == _DEFAULT_TEST_DIM_SIZES: numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64") else: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0cf4cc03a09..1ddb5a569bd 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -152,6 +152,21 @@ def test_concat_missing_var() -> None: assert_identical(actual, expected) +def test_concat_categorical() -> None: + data1 = create_test_data(use_extension_array=True) + data2 = create_test_data(use_extension_array=True) + concatenated = concat([data1, data2], dim="dim1") + assert ( + concatenated["var4"] + == type(data2["var4"].variable.data.array)._concat_same_type( + [ + data1["var4"].variable.data.array, + data2["var4"].variable.data.array, + ] + ) + ).all() + + def test_concat_missing_multiple_consecutive_var() -> None: datasets = create_concat_datasets(3, seed=123) expected = concat(datasets, dim="day") @@ -451,8 +466,11 @@ def test_concat_fill_missing_variables( class TestConcatDataset: @pytest.fixture - def data(self) -> Dataset: - return create_test_data().drop_dims("dim3") + def data(self, request) -> Dataset: + use_extension_array = request.param if hasattr(request, "param") else False + return create_test_data(use_extension_array=use_extension_array).drop_dims( + "dim3" + ) def rectify_dim_order(self, data, dataset) -> Dataset: # return a new dataset with all variable dimensions transposed into @@ -464,7 +482,9 @@ def rectify_dim_order(self, data, dataset) -> Dataset: ) @pytest.mark.parametrize("coords", ["different", "minimal"]) - @pytest.mark.parametrize("dim", ["dim1", "dim2"]) + @pytest.mark.parametrize( + "dim,data", [["dim1", True], ["dim2", False]], indirect=["data"] + ) def test_concat_simple(self, data, dim, coords) -> None: datasets = [g for _, g in data.groupby(dim, squeeze=False)] assert_identical(data, concat(datasets, dim, coords=coords)) @@ -492,6 +512,7 @@ def test_concat_merge_variables_present_in_some_datasets(self, data) -> None: expected = data.copy().assign(foo=(["dim1", "bar"], foo)) assert_identical(expected, actual) + @pytest.mark.parametrize("data", [False], indirect=["data"]) def test_concat_2(self, data) -> None: dim = "dim2" datasets = [g.squeeze(dim) for _, g in data.groupby(dim, squeeze=False)] diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e2a64964775..a948fafc815 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4614,10 +4614,12 @@ def test_to_and_from_dataframe(self) -> None: x = np.random.randn(10) y = np.random.randn(10) t = list("abcdefghij") - ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t)}) + cat = pd.Categorical(["a", "b"] * 5) + ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t), "cat": ("t", cat)}) expected = pd.DataFrame( np.array([x, y]).T, columns=["a", "b"], index=pd.Index(t, name="t") ) + expected["cat"] = cat actual = ds.to_dataframe() # use the .equals method to check all DataFrame metadata assert expected.equals(actual), (expected, actual) @@ -4628,23 +4630,31 @@ def test_to_and_from_dataframe(self) -> None: # check roundtrip assert_identical(ds, Dataset.from_dataframe(actual)) - + assert isinstance(ds["cat"].variable.data.dtype, pd.CategoricalDtype) # test a case with a MultiIndex w = np.random.randn(2, 3) - ds = Dataset({"w": (("x", "y"), w)}) + cat = pd.Categorical(["a", "a", "c"]) + ds = Dataset({"w": (("x", "y"), w), "cat": ("y", cat)}) ds["y"] = ("y", list("abc")) exp_index = pd.MultiIndex.from_arrays( [[0, 0, 0, 1, 1, 1], ["a", "b", "c", "a", "b", "c"]], names=["x", "y"] ) - expected = pd.DataFrame(w.reshape(-1), columns=["w"], index=exp_index) + expected = pd.DataFrame( + {"w": w.reshape(-1), "cat": pd.Categorical(["a", "a", "c", "a", "a", "c"])}, + index=exp_index, + ) actual = ds.to_dataframe() assert expected.equals(actual) # check roundtrip + # from_dataframe attempts to broadcast across because it doesn't know better, so cat must be converted + ds["cat"] = (("x", "y"), np.stack((ds["cat"].to_numpy(), ds["cat"].to_numpy()))) assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual)) # Check multiindex reordering new_order = ["x", "y"] + # revert broadcasting fix above for 1d arrays + ds["cat"] = ("y", cat) actual = ds.to_dataframe(dim_order=new_order) assert expected.equals(actual) @@ -4653,7 +4663,11 @@ def test_to_and_from_dataframe(self) -> None: [["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], names=["y", "x"] ) expected = pd.DataFrame( - w.transpose().reshape(-1), columns=["w"], index=exp_index + { + "w": w.transpose().reshape(-1), + "cat": pd.Categorical(["a", "a", "a", "a", "c", "c"]), + }, + index=exp_index, ) actual = ds.to_dataframe(dim_order=new_order) assert expected.equals(actual) @@ -4706,7 +4720,7 @@ def test_to_and_from_dataframe(self) -> None: expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) - def test_from_dataframe_categorical(self) -> None: + def test_from_dataframe_categorical_index(self) -> None: cat = pd.CategoricalDtype( categories=["foo", "bar", "baz", "qux", "quux", "corge"] ) @@ -4721,7 +4735,7 @@ def test_from_dataframe_categorical(self) -> None: assert len(ds["i1"]) == 2 assert len(ds["i2"]) == 2 - def test_from_dataframe_categorical_string_categories(self) -> None: + def test_from_dataframe_categorical_index_string_categories(self) -> None: cat = pd.CategoricalIndex( pd.Categorical.from_codes( np.array([1, 1, 0, 2]), @@ -5449,18 +5463,22 @@ def test_reduce_cumsum_test_dims(self, reduct, expected, func) -> None: assert list(actual) == expected def test_reduce_non_numeric(self) -> None: - data1 = create_test_data(seed=44) + data1 = create_test_data(seed=44, use_extension_array=True) data2 = create_test_data(seed=44) - add_vars = {"var4": ["dim1", "dim2"], "var5": ["dim1"]} + add_vars = {"var5": ["dim1", "dim2"], "var6": ["dim1"]} for v, dims in sorted(add_vars.items()): size = tuple(data1.sizes[d] for d in dims) data = np.random.randint(0, 100, size=size).astype(np.str_) data1[v] = (dims, data, {"foo": "variable"}) - - assert "var4" not in data1.mean() and "var5" not in data1.mean() + # var4 is extension array categorical and should be dropped + assert ( + "var4" not in data1.mean() + and "var5" not in data1.mean() + and "var6" not in data1.mean() + ) assert_equal(data1.mean(), data2.mean()) assert_equal(data1.mean(dim="dim1"), data2.mean(dim="dim1")) - assert "var4" not in data1.mean(dim="dim2") and "var5" in data1.mean(dim="dim2") + assert "var5" not in data1.mean(dim="dim2") and "var6" in data1.mean(dim="dim2") @pytest.mark.filterwarnings( "ignore:Once the behaviour of DataArray:DeprecationWarning" diff --git a/xarray/datatree_/datatree/tests/test_mapping.py b/xarray/tests/test_datatree_mapping.py similarity index 98% rename from xarray/datatree_/datatree/tests/test_mapping.py rename to xarray/tests/test_datatree_mapping.py index c6cd04887c0..16ca726759d 100644 --- a/xarray/datatree_/datatree/tests/test_mapping.py +++ b/xarray/tests/test_datatree_mapping.py @@ -1,9 +1,13 @@ import numpy as np import pytest -import xarray as xr +import xarray as xr from xarray.core.datatree import DataTree -from xarray.datatree_.datatree.mapping import TreeIsomorphismError, check_isomorphic, map_over_subtree +from xarray.core.datatree_mapping import ( + TreeIsomorphismError, + check_isomorphic, + map_over_subtree, +) from xarray.datatree_.datatree.testing import assert_equal empty = xr.Dataset() @@ -12,7 +16,7 @@ class TestCheckTreesIsomorphic: def test_not_a_tree(self): with pytest.raises(TypeError, match="not a tree"): - check_isomorphic("s", 1) + check_isomorphic("s", 1) # type: ignore[arg-type] def test_different_widths(self): dt1 = DataTree.from_dict(d={"a": empty}) @@ -69,7 +73,7 @@ def test_not_isomorphic_complex_tree(self, create_test_datatree): def test_checking_from_root(self, create_test_datatree): dt1 = create_test_datatree() dt2 = create_test_datatree() - real_root = DataTree(name="real root") + real_root: DataTree = DataTree(name="real root") dt2.name = "not_real_root" dt2.parent = real_root with pytest.raises(TreeIsomorphismError): diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index df1ab1f40f9..26821c69495 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -27,6 +27,7 @@ timedelta_to_numeric, where, ) +from xarray.core.extension_array import PandasExtensionArray from xarray.namedarray.pycompat import array_type from xarray.testing import assert_allclose, assert_equal, assert_identical from xarray.tests import ( @@ -38,11 +39,55 @@ requires_bottleneck, requires_cftime, requires_dask, + requires_pyarrow, ) dask_array_type = array_type("dask") +@pytest.fixture +def categorical1(): + return pd.Categorical(["cat1", "cat2", "cat2", "cat1", "cat2"]) + + +@pytest.fixture +def categorical2(): + return pd.Categorical(["cat2", "cat1", "cat2", "cat3", "cat1"]) + + +try: + import pyarrow as pa + + @pytest.fixture + def arrow1(): + return pd.arrays.ArrowExtensionArray( + pa.array([{"x": 1, "y": True}, {"x": 2, "y": False}]) + ) + + @pytest.fixture + def arrow2(): + return pd.arrays.ArrowExtensionArray( + pa.array([{"x": 3, "y": False}, {"x": 4, "y": True}]) + ) + +except ImportError: + pass + + +@pytest.fixture +def int1(): + return pd.arrays.IntegerArray( + np.array([1, 2, 3, 4, 5]), np.array([True, False, False, True, True]) + ) + + +@pytest.fixture +def int2(): + return pd.arrays.IntegerArray( + np.array([6, 7, 8, 9, 10]), np.array([True, True, False, True, False]) + ) + + class TestOps: @pytest.fixture(autouse=True) def setUp(self): @@ -119,6 +164,51 @@ def test_where_type_promotion(self): assert result.dtype == np.float32 assert_array_equal(result, np.array([1, np.nan], dtype=np.float32)) + def test_where_extension_duck_array(self, categorical1, categorical2): + where_res = where( + np.array([True, False, True, False, False]), + PandasExtensionArray(categorical1), + PandasExtensionArray(categorical2), + ) + assert isinstance(where_res, PandasExtensionArray) + assert ( + where_res == pd.Categorical(["cat1", "cat1", "cat2", "cat3", "cat1"]) + ).all() + + def test_concatenate_extension_duck_array(self, categorical1, categorical2): + concate_res = concatenate( + [PandasExtensionArray(categorical1), PandasExtensionArray(categorical2)] + ) + assert isinstance(concate_res, PandasExtensionArray) + assert ( + concate_res + == type(categorical1)._concat_same_type((categorical1, categorical2)) + ).all() + + @requires_pyarrow + def test_duck_extension_array_pyarrow_concatenate(self, arrow1, arrow2): + concatenated = concatenate( + (PandasExtensionArray(arrow1), PandasExtensionArray(arrow2)) + ) + assert concatenated[2]["x"] == 3 + assert concatenated[3]["y"] + + def test___getitem__extension_duck_array(self, categorical1): + extension_duck_array = PandasExtensionArray(categorical1) + assert (extension_duck_array[0:2] == categorical1[0:2]).all() + assert isinstance(extension_duck_array[0:2], PandasExtensionArray) + assert extension_duck_array[0] == categorical1[0] + assert isinstance(extension_duck_array[0], PandasExtensionArray) + mask = [True, False, True, False, True] + assert (extension_duck_array[mask] == categorical1[mask]).all() + + def test__setitem__extension_duck_array(self, categorical1): + extension_duck_array = PandasExtensionArray(categorical1) + extension_duck_array[2] = "cat1" # already existing category + assert extension_duck_array[2] == "cat1" + with pytest.raises(TypeError, match="Cannot setitem on a Categorical"): + extension_duck_array[2] = "cat4" # new category + def test_stack_type_promotion(self): result = stack([1, "b"]) assert_array_equal(result, np.array([1, "b"], dtype=object)) @@ -932,3 +1022,21 @@ def test_push_dask(): dask.array.from_array(array, chunks=(1, 2, 3, 2, 2, 1, 1)), axis=0, n=n ) np.testing.assert_equal(actual, expected) + + +def test_duck_extension_array_equality(categorical1, int1): + int_duck_array = PandasExtensionArray(int1) + categorical_duck_array = PandasExtensionArray(categorical1) + assert (int_duck_array != categorical_duck_array).all() + assert (categorical_duck_array == categorical1).all() + assert (int_duck_array[0:2] == int1[0:2]).all() + + +def test_duck_extension_array_repr(int1): + int_duck_array = PandasExtensionArray(int1) + assert repr(int1) in repr(int_duck_array) + + +def test_duck_extension_array_attr(int1): + int_duck_array = PandasExtensionArray(int1) + assert (~int_duck_array.fillna(10)).all() diff --git a/xarray/tests/test_formatting_html.py b/xarray/tests/test_formatting_html.py index 6540406e914..ada7f75b21b 100644 --- a/xarray/tests/test_formatting_html.py +++ b/xarray/tests/test_formatting_html.py @@ -7,6 +7,7 @@ import xarray as xr from xarray.core import formatting_html as fh from xarray.core.coordinates import Coordinates +from xarray.core.datatree import DataTree @pytest.fixture @@ -196,3 +197,197 @@ def test_nonstr_variable_repr_html() -> None: html = v._repr_html_().strip() assert "
22 :
bar
" in html assert "
  • 10: 3
  • " in html + + +@pytest.fixture(scope="module", params=["some html", "some other html"]) +def repr(request): + return request.param + + +class Test_summarize_datatree_children: + """ + Unit tests for summarize_datatree_children. + """ + + func = staticmethod(fh.summarize_datatree_children) + + @pytest.fixture(scope="class") + def childfree_tree_factory(self): + """ + Fixture for a child-free DataTree factory. + """ + from random import randint + + def _childfree_tree_factory(): + return DataTree( + data=xr.Dataset({"z": ("y", [randint(1, 100) for _ in range(3)])}) + ) + + return _childfree_tree_factory + + @pytest.fixture(scope="class") + def childfree_tree(self, childfree_tree_factory): + """ + Fixture for a child-free DataTree. + """ + return childfree_tree_factory() + + @pytest.fixture(scope="function") + def mock_datatree_node_repr(self, monkeypatch): + """ + Apply mocking for datatree_node_repr. + """ + + def mock(group_title, dt): + """ + Mock with a simple result + """ + return group_title + " " + str(id(dt)) + + monkeypatch.setattr(fh, "datatree_node_repr", mock) + + @pytest.fixture(scope="function") + def mock_wrap_datatree_repr(self, monkeypatch): + """ + Apply mocking for _wrap_datatree_repr. + """ + + def mock(r, *, end, **kwargs): + """ + Mock by appending "end" or "not end". + """ + return r + " " + ("end" if end else "not end") + "//" + + monkeypatch.setattr(fh, "_wrap_datatree_repr", mock) + + def test_empty_mapping(self): + """ + Test with an empty mapping of children. + """ + children: dict[str, DataTree] = {} + assert self.func(children) == ( + "
    " + "
    " + ) + + def test_one_child( + self, childfree_tree, mock_wrap_datatree_repr, mock_datatree_node_repr + ): + """ + Test with one child. + + Uses a mock of _wrap_datatree_repr and _datatree_node_repr to essentially mock + the inline lambda function "lines_callback". + """ + # Create mapping of children + children = {"a": childfree_tree} + + # Expect first line to be produced from the first child, and + # wrapped as the last child + first_line = f"a {id(children['a'])} end//" + + assert self.func(children) == ( + "
    " + f"{first_line}" + "
    " + ) + + def test_two_children( + self, childfree_tree_factory, mock_wrap_datatree_repr, mock_datatree_node_repr + ): + """ + Test with two level deep children. + + Uses a mock of _wrap_datatree_repr and datatree_node_repr to essentially mock + the inline lambda function "lines_callback". + """ + + # Create mapping of children + children = {"a": childfree_tree_factory(), "b": childfree_tree_factory()} + + # Expect first line to be produced from the first child, and + # wrapped as _not_ the last child + first_line = f"a {id(children['a'])} not end//" + + # Expect second line to be produced from the second child, and + # wrapped as the last child + second_line = f"b {id(children['b'])} end//" + + assert self.func(children) == ( + "
    " + f"{first_line}" + f"{second_line}" + "
    " + ) + + +class Test__wrap_datatree_repr: + """ + Unit tests for _wrap_datatree_repr. + """ + + func = staticmethod(fh._wrap_datatree_repr) + + def test_end(self, repr): + """ + Test with end=True. + """ + r = self.func(repr, end=True) + assert r == ( + "
    " + "
    " + "
    " + "
    " + "
    " + "
    " + f"{repr}" + "
    " + "
    " + ) + + def test_not_end(self, repr): + """ + Test with end=False. + """ + r = self.func(repr, end=False) + assert r == ( + "
    " + "
    " + "
    " + "
    " + "
    " + "
    " + f"{repr}" + "
    " + "
    " + ) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 5adc41fcfdf..90f385e7621 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -36,6 +36,7 @@ def dataset() -> xr.Dataset: { "foo": (("x", "y", "z"), np.random.randn(3, 4, 2)), "baz": ("x", ["e", "f", "g"]), + "cat": ("y", pd.Categorical(["cat1", "cat2", "cat2", "cat1"])), }, {"x": ("x", ["a", "b", "c"], {"name": "x"}), "y": [1, 2, 3, 4], "z": [1, 2]}, ) @@ -80,6 +81,7 @@ def test_groupby_dims_property(dataset, recwarn) -> None: ) assert len(recwarn) == 0 + dataset = dataset.drop_vars(["cat"]) stacked = dataset.stack({"xy": ("x", "y")}) assert tuple(stacked.groupby("xy", squeeze=False).dims) == tuple( stacked.isel(xy=[0]).dims @@ -92,7 +94,7 @@ def test_groupby_sizes_property(dataset) -> None: assert dataset.groupby("x").sizes == dataset.isel(x=1).sizes with pytest.warns(UserWarning, match="The `squeeze` kwarg"): assert dataset.groupby("y").sizes == dataset.isel(y=1).sizes - + dataset = dataset.drop("cat") stacked = dataset.stack({"xy": ("x", "y")}) with pytest.warns(UserWarning, match="The `squeeze` kwarg"): assert stacked.groupby("xy").sizes == stacked.isel(xy=0).sizes @@ -762,6 +764,8 @@ def test_groupby_getitem(dataset) -> None: assert_identical(dataset.foo.sel(x="a"), dataset.foo.groupby("x")["a"]) with pytest.warns(UserWarning, match="The `squeeze` kwarg"): assert_identical(dataset.foo.sel(z=1), dataset.foo.groupby("z")[1]) + with pytest.warns(UserWarning, match="The `squeeze` kwarg"): + assert_identical(dataset.cat.sel(y=1), dataset.cat.groupby("y")[1]) assert_identical(dataset.sel(x=["a"]), dataset.groupby("x", squeeze=False)["a"]) assert_identical(dataset.sel(z=[1]), dataset.groupby("z", squeeze=False)[1]) @@ -771,6 +775,12 @@ def test_groupby_getitem(dataset) -> None: ) assert_identical(dataset.foo.sel(z=[1]), dataset.foo.groupby("z", squeeze=False)[1]) + assert_identical(dataset.cat.sel(y=[1]), dataset.cat.groupby("y", squeeze=False)[1]) + with pytest.raises( + NotImplementedError, match="Cannot broadcast 1d-only pandas categorical array." + ): + dataset.groupby("boo", squeeze=False) + dataset = dataset.drop_vars(["cat"]) actual = ( dataset.groupby("boo", squeeze=False)["f"].unstack().transpose("x", "y", "z") ) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index c6597d5abb0..52935e9714e 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -37,7 +37,7 @@ def test_merge_arrays(self): assert_identical(actual, expected) def test_merge_datasets(self): - data = create_test_data(add_attrs=False) + data = create_test_data(add_attrs=False, use_extension_array=True) actual = xr.merge([data[["var1"]], data[["var2"]]]) expected = data[["var1", "var2"]] diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d9289aa6674..8a9345e74d4 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1576,6 +1576,20 @@ def test_transpose_0d(self): actual = variable.transpose() assert_identical(actual, variable) + def test_pandas_cateogrical_dtype(self): + data = pd.Categorical(np.arange(10, dtype="int64")) + v = self.cls("x", data) + print(v) # should not error + assert pd.api.types.is_extension_array_dtype(v.dtype) + + def test_pandas_cateogrical_no_chunk(self): + data = pd.Categorical(np.arange(10, dtype="int64")) + v = self.cls("x", data) + with pytest.raises( + ValueError, match=r".*was found to be a Pandas ExtensionArray.*" + ): + v.chunk((5,)) + def test_squeeze(self): v = Variable(["x", "y"], [[1]]) assert_identical(Variable([], 1), v.squeeze()) @@ -2373,6 +2387,11 @@ def test_multiindex(self): def test_pad(self, mode, xr_arg, np_arg): super().test_pad(mode, xr_arg, np_arg) + def test_pandas_cateogrical_dtype(self): + data = pd.Categorical(np.arange(10, dtype="int64")) + with pytest.raises(ValueError, match="was found to be a Pandas ExtensionArray"): + self.cls("x", data) + @requires_sparse class TestVariableWithSparse: