xarray-contrib · dcherian · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022
diff --git a/flox/core.py b/flox/core.py
@@ -446,7 +446,12 @@ def factorize_(
     for groupvar, expect in zip(by, expected_groups):
         flat = groupvar.reshape(-1)
         if isinstance(expect, pd.RangeIndex):
-            idx = flat
+            # idx is a view of the original `by` array
+            # copy here so we don't have a race condition with the
+            # group_idx[nanmask] = nan_sentinel assignment later
+            # this is important in shared-memory parallelism with dask
+            # TODO: figure out how to avoid this
+            idx = flat.copy()
             found_groups.append(np.array(expect))
             # TODO: fix by using masked integers
             idx[idx > expect[-1]] = -1

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1282,3 +1282,30 @@ def test_1d_blockwise_sort_optimization():
         array, time.dt.dayofyear.values[::-1], sort=False, method="blockwise", func="count"
     )
     assert all("getitem" not in k for k in actual.dask.layers)
+
+
+@requires_dask
+def test_negative_index_factorize_race_condition():  # regression test for the race described in factorize_ (flox/core.py)
+    # shape = (10, 2000)
+    # chunks = ((shape[0]-1,1), 10)
+    shape = (101, 174000)
+    chunks = ((101,), 8760)  # one chunk along axis 0, blocks of 8760 along axis 1
+    eps = dask.array.random.random_sample(shape, chunks=chunks)
+    N2 = dask.array.random.random_sample(shape, chunks=chunks)
+    S2 = dask.array.random.random_sample(shape, chunks=chunks)
+
+    bins = np.arange(-5, -2.05, 0.1)  # every bin edge is negative
+    func = ["mean", "count", "sum"]
+
+    out = [  # one lazy groupby_reduce result per reduction name in `func`
+        groupby_reduce(
+            eps,
+            N2,  # NOTE(review): N2 and S2 presumably act as the two grouping arrays, matching the 2-tuples below -- confirm
+            S2,
+            func=f,
+            expected_groups=(bins, bins),
+            isbin=(True, True),
+        )
+        for f in func
+    ]
+    [dask.compute(out, scheduler="threads") for _ in range(5)]  # no assertions: passes if 5 computes on the threaded scheduler finish without raising