pydata · dcherian · Jun 11, 2024 · May 19, 2024 · May 18, 2024 · May 18, 2024
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -28,6 +28,8 @@ Performance
 
 - Small optimization to the netCDF4 and h5netcdf backends (:issue:`9058`, :pull:`9067`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Small optimizations to help reduce indexing speed of datasets (:pull:`9002`).
+  By `Mark Harfouche <https://github.com/hmaarrfk>`_.
 
 
 Breaking changes
@@ -2903,7 +2905,7 @@ Bug fixes
   process (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard
   calendar dates with time units of nanoseconds (:pull:`4400`).
   By `Spencer Clark <https://github.com/spencerkclark>`_ and `Mark Harfouche
-  <http://github.com/hmaarrfk>`_.
+  <https://github.com/hmaarrfk>`_.
 - :py:meth:`DataArray.astype`, :py:meth:`Dataset.astype` and :py:meth:`Variable.astype` support
   the ``order`` and ``subok`` parameters again. This fixes a regression introduced in version 0.16.1
   (:issue:`4644`, :pull:`4683`).

diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
@@ -575,13 +575,24 @@ class PandasIndex(Index):
 
     __slots__ = ("index", "dim", "coord_dtype")
 
-    def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None):
-        # make a shallow copy: cheap and because the index name may be updated
-        # here or in other constructors (cannot use pd.Index.rename as this
-        # constructor is also called from PandasMultiIndex)
-        index = safe_cast_to_index(array).copy()
+    def __init__(
+        self,
+        array: Any,
+        dim: Hashable,
+        coord_dtype: Any = None,
+        *,
+        fastpath: bool = False,
+    ):
+        if fastpath:
+            index = array
+        else:
+            index = safe_cast_to_index(array)
 
         if index.name is None:
+            # make a shallow copy: cheap and because the index name may be updated
+            # here or in other constructors (cannot use pd.Index.rename as this
+            # constructor is also called from PandasMultiIndex)
+            index = index.copy()
             index.name = dim
 
         self.index = index
@@ -596,7 +607,7 @@ def _replace(self, index, dim=None, coord_dtype=None):
             dim = self.dim
         if coord_dtype is None:
             coord_dtype = self.coord_dtype
-        return type(self)(index, dim, coord_dtype)
+        return type(self)(index, dim, coord_dtype, fastpath=True)
 
     @classmethod
     def from_variables(
@@ -642,6 +653,11 @@ def from_variables(
 
         obj = cls(data, dim, coord_dtype=var.dtype)
         assert not isinstance(obj.index, pd.MultiIndex)
+        # Rename safely
+        # make a shallow copy: cheap and because the index name may be updated
+        # here or in other constructors (cannot use pd.Index.rename as this
+        # constructor is also called from PandasMultiIndex)
+        obj.index = obj.index.copy()
         obj.index.name = name
 
         return obj
@@ -1773,6 +1789,36 @@ def check_variables():
     return not not_equal
 
 
+def _apply_indexes_fast(indexes: Indexes[Index], args: Mapping[Any, Any], func: str):
+    # This function avoids the call to indexes.group_by_index
+    # which is really slow when repeatidly iterating through
+    # an array. However, it fails to return the correct ID for
+    # multi-index arrays
+    indexes_fast, coords = indexes._indexes, indexes._variables
+
+    new_indexes: dict[Hashable, Index] = {k: v for k, v in indexes_fast.items()}
+    new_index_variables: dict[Hashable, Variable] = {}
+    for name, index in indexes_fast.items():
+        coord = coords[name]
+        if hasattr(coord, "_indexes"):
+            index_vars = {n: coords[n] for n in coord._indexes}
+        else:
+            index_vars = {name: coord}
+        index_dims = {d for var in index_vars.values() for d in var.dims}
+        index_args = {k: v for k, v in args.items() if k in index_dims}
+
+        if index_args:
+            new_index = getattr(index, func)(index_args)
+            if new_index is not None:
+                new_indexes.update({k: new_index for k in index_vars})
+                new_index_vars = new_index.create_variables(index_vars)
+                new_index_variables.update(new_index_vars)
+            else:
+                for k in index_vars:
+                    new_indexes.pop(k, None)
+    return new_indexes, new_index_variables
+
+
 def _apply_indexes(
     indexes: Indexes[Index],
     args: Mapping[Any, Any],
@@ -1801,7 +1847,13 @@ def isel_indexes(
     indexes: Indexes[Index],
     indexers: Mapping[Any, Any],
 ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
-    return _apply_indexes(indexes, indexers, "isel")
+    # TODO: remove if clause in the future. It should be unnecessary.
+    # See failure introduced when removed
+    # https://github.com/pydata/xarray/pull/9002#discussion_r1590443756
+    if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
+        return _apply_indexes(indexes, indexers, "isel")
+    else:
+        return _apply_indexes_fast(indexes, indexers, "isel")
 
 
 def roll_indexes(

diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -296,15 +296,16 @@ def slice_slice(old_slice: slice, applied_slice: slice, size: int) -> slice:
 
 
 def _index_indexer_1d(old_indexer, applied_indexer, size: int):
-    assert isinstance(applied_indexer, integer_types + (slice, np.ndarray))
     if isinstance(applied_indexer, slice) and applied_indexer == slice(None):
         # shortcut for the usual case
         return old_indexer
     if isinstance(old_indexer, slice):
         if isinstance(applied_indexer, slice):
             indexer = slice_slice(old_indexer, applied_indexer, size)
+        elif isinstance(applied_indexer, integer_types):
+            indexer = range(*old_indexer.indices(size))[applied_indexer]  # type: ignore[assignment]
         else:
-            indexer = _expand_slice(old_indexer, size)[applied_indexer]  # type: ignore[assignment]
+            indexer = _expand_slice(old_indexer, size)[applied_indexer]
     else:
         indexer = old_indexer[applied_indexer]
     return indexer
@@ -591,7 +592,7 @@ def __getitem__(self, key: Any):
 class LazilyIndexedArray(ExplicitlyIndexedNDArrayMixin):
     """Wrap an array to make basic and outer indexing lazy."""
 
-    __slots__ = ("array", "key")
+    __slots__ = ("array", "key", "_shape")
 
     def __init__(self, array: Any, key: ExplicitIndexer | None = None):
         """
@@ -614,6 +615,14 @@ def __init__(self, array: Any, key: ExplicitIndexer | None = None):
         self.array = as_indexable(array)
         self.key = key
 
+        shape: _Shape = ()
+        for size, k in zip(self.array.shape, self.key.tuple):
+            if isinstance(k, slice):
+                shape += (len(range(*k.indices(size))),)
+            elif isinstance(k, np.ndarray):
+                shape += (k.size,)
+        self._shape = shape
+
     def _updated_key(self, new_key: ExplicitIndexer) -> BasicIndexer | OuterIndexer:
         iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim))
         full_key = []
@@ -630,13 +639,7 @@ def _updated_key(self, new_key: ExplicitIndexer) -> BasicIndexer | OuterIndexer:
 
     @property
     def shape(self) -> _Shape:
-        shape = []
-        for size, k in zip(self.array.shape, self.key.tuple):
-            if isinstance(k, slice):
-                shape.append(len(range(*k.indices(size))))
-            elif isinstance(k, np.ndarray):
-                shape.append(k.size)
-        return tuple(shape)
+        return self._shape
 
     def get_duck_array(self):
         if isinstance(self.array, ExplicitlyIndexedNDArrayMixin):