From 6f2a6d72509a6e92093111636b61827cd7268fd3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 12 Dec 2022 11:38:59 -0800 Subject: [PATCH 01/42] FEAT-#5423: Begin implementing NumPy API Layer Signed-off-by: Rehan Durrani From 7a4fa99d3c32821edb9c3b758e018485573b9d82 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 11 Nov 2022 18:08:04 -0600 Subject: [PATCH 02/42] Start Signed-off-by: Devin Petersohn --- modin/numpy/__init__.py | 0 modin/numpy/arr.py | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 modin/numpy/__init__.py create mode 100644 modin/numpy/arr.py diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py new file mode 100644 index 00000000000..8e2421b397d --- /dev/null +++ b/modin/numpy/arr.py @@ -0,0 +1,13 @@ +import numpy + +class array(object): + + def __init__(self, object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0, like=None, query_compiler=None): + if query_compiler is not None: + pass + arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndim=ndim, like=like) + pass + + def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): + return self._query_compiler.sum(axis=axis) + From 2a08cf0c5da0d238768728e85d68eaf2118bbce5 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 11 Nov 2022 18:35:14 -0600 Subject: [PATCH 03/42] Next Signed-off-by: Devin Petersohn --- modin/numpy/arr.py | 13 +++++++++---- modin/numpy/math.py | 8 ++++++++ modin/pandas/base.py | 8 +++----- 3 files changed, 20 insertions(+), 9 deletions(-) create mode 100644 modin/numpy/math.py diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 8e2421b397d..ed83737cdd8 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -2,12 +2,17 @@ class array(object): - def __init__(self, object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0, like=None, query_compiler=None): + def __init__(self, object=None, dtype=None, *, copy=True, order='K', subok=False, ndmin=0, like=None, query_compiler=None): if query_compiler is not None: + self._query_compiler = query_compiler + else: + arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndmin=ndmin, like=like) pass - arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndim=ndim, like=like) - pass def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): - return self._query_compiler.sum(axis=axis) + result = self._query_compiler.sum(axis=axis) + if dtype is not None: + result.astype(dtype) + def __repr__(self): + return repr(self._query_compiler.to_numpy()) diff --git a/modin/numpy/math.py b/modin/numpy/math.py new file mode 100644 index 00000000000..525ef71b989 --- /dev/null +++ b/modin/numpy/math.py @@ -0,0 +1,8 @@ +import numpy + + +def sum(arr, axis): + if hasattr(arr, "_sum"): + return arr._sum(axis) + else: + return numpy.sum(arr) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 0470182cb60..7289b315bbb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3238,11 +3238,9 @@ def to_numpy( """ Convert the `BasePandasDataset` to a NumPy array. """ - return self._query_compiler.to_numpy( - dtype=dtype, - copy=copy, - na_value=na_value, - ) + from ..numpy.arr import array + + return array(query_compiler=self._query_compiler) # TODO(williamma12): When this gets implemented, have the series one call this. def to_period( From 4b68f50f1300dedfd5b5208aba8de9ae266b0b0f Mon Sep 17 00:00:00 2001 From: Bill Wang Date: Fri, 11 Nov 2022 17:15:10 -0800 Subject: [PATCH 04/42] Added absolute, abs, add, all, subtract to modin.numpy Signed-off-by: Bill Wang --- modin/numpy/__init__.py | 2 ++ modin/numpy/arr.py | 16 ++++++++++++++++ modin/numpy/math.py | 25 +++++++++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index e69de29bb2d..550e1f13c3d 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -0,0 +1,2 @@ +from .arr import * +from .math import * \ No newline at end of file diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index ed83737cdd8..5f4af4f2013 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -9,6 +9,22 @@ def __init__(self, object=None, dtype=None, *, copy=True, order='K', subok=False arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndmin=ndmin, like=like) pass + def _absolute(self, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.abs().to_numpy() + return result + + def _add(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.add(x2._query_compiler).to_numpy() + return result + + def _all(self, axis=None, out=None, keepdims=None, where=None): + result = self._query_compiler.all(axis=axis).to_numpy() + return result + + def _subtract(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.sub(x2._query_compiler).to_numpy() + return result + def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): result = self._query_compiler.sum(axis=axis) if dtype is not None: diff --git a/modin/numpy/math.py b/modin/numpy/math.py index 525ef71b989..b1b9a71b093 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -1,5 +1,30 @@ import numpy +def absolute(x, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x, "_absolute"): + return x._absolute(out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.absolute(x, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + +abs = absolute + + +def add(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_add"): + return x1._add(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.add(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def all(a, axis=None, out=None, keepdims=None, where=None): + if hasattr(a, "_all"): + return a._all(axis=axis, out=out, keepdims=keepdims, where=where) + return numpy.all(a, axis=axis, out=out, keepdims=keepdims, where=where) + + +def subtract(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_subtract"): + return x1._subtract(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.subtract(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + def sum(arr, axis): if hasattr(arr, "_sum"): From 0b915b4ceb9f2dc1352f930900dbb163bc32e576 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 22 Nov 2022 13:54:58 -0600 Subject: [PATCH 05/42] Add changes Signed-off-by: Devin Petersohn --- modin/numpy/arr.py | 6 +++++- modin/numpy/math.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 5f4af4f2013..ec3fc95c2ec 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -28,7 +28,11 @@ def _subtract(self, x2, out=None, where=True, casting='same_kind', order='K', dt def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): result = self._query_compiler.sum(axis=axis) if dtype is not None: - result.astype(dtype) + result = result.astype(dtype) + if out is not None: + out._query_compiler = result + return + return array(query_compiler=result) def __repr__(self): return repr(self._query_compiler.to_numpy()) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index b1b9a71b093..15f0e82a6bc 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -26,7 +26,7 @@ def subtract(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype return numpy.subtract(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) -def sum(arr, axis): +def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): if hasattr(arr, "_sum"): return arr._sum(axis) else: From 9c7a66b86d79d5de527b8b6ebf58078b7c674e9e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 11 Nov 2022 20:13:04 -0800 Subject: [PATCH 06/42] Add shape + reshape Signed-off-by: Rehan Durrani --- modin/numpy/__init__.py | 3 ++- modin/numpy/arr.py | 25 +++++++++++++++++++++++++ modin/numpy/constants.py | 17 +++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 modin/numpy/constants.py diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index 550e1f13c3d..8c79d10ade8 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -1,2 +1,3 @@ from .arr import * -from .math import * \ No newline at end of file +from .math import * +from .constants import * diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index ec3fc95c2ec..655a2aaecfe 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -34,5 +34,30 @@ def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, whe return return array(query_compiler=result) + def _get_shape(self): + return (len(self._query_compiler.index), len(self._query_compiler.columns)) + + def _set_shape(self, new_shape): + if not (isinstance(new_shape, int)) and not isinstance(new_shape, tuple): + raise TypeError(f"expected a sequence of integers or a single integer, got '{new_shape}'") + elif isinstance(new_shape, tuple): + for dim in new_shape: + if not isinstance(dim, int): + raise TypeError(f"'{type(dim)}' object cannot be interpreted as an integer") + from math import prod + new_dimensions = new_shape if isinstance(new_shape, int) else prod(new_shape) + if new_dimensions != prod(self._get_shape()): + raise ValueError(f"cannot reshape array of size {prod(self._get_shape)} into {new_shape if isinstance(new_shape, tuple) else (new_shape,)}") + if isinstance(new_shape, int): + qcs = [] + for index_val in self._query_compiler.index[1:]: + qcs.append(self._query_compiler.getitem_row_array([index_val]).reset_index(drop=True)) + self._query_compiler = self._query_compiler.getitem_row_array([self._query_compiler.index[0]]).reset_index(drop=True).concat(1, qcs, ignore_index=True) + else: + raise NotImplementedError("Reshaping from a 2D object to a 2D object is not currently supported!") + + shape = property(_get_shape, _set_shape) + + def __repr__(self): return repr(self._query_compiler.to_numpy()) diff --git a/modin/numpy/constants.py b/modin/numpy/constants.py new file mode 100644 index 00000000000..8d720e9b3af --- /dev/null +++ b/modin/numpy/constants.py @@ -0,0 +1,17 @@ +from numpy import ( + Inf, + Infinity, + NAN, + NINF, + NZERO, + NaN, + PINF, + PZERO, + e, + euler_gamma, + inf, + infty, + nan, + newaxis, + pi +) From 1c6d708c2be00ade058e857b9b75c66b61af4d36 Mon Sep 17 00:00:00 2001 From: Bill Wang Date: Fri, 11 Nov 2022 20:19:21 -0800 Subject: [PATCH 07/42] Added additional math functions for numpy Signed-off-by: Bill Wang --- modin/numpy/arr.py | 47 ++++++++++++++++++++++++++++++++++--------- modin/numpy/math.py | 49 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 655a2aaecfe..aa0a5125f40 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -10,20 +10,49 @@ def __init__(self, object=None, dtype=None, *, copy=True, order='K', subok=False pass def _absolute(self, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): - result = self._query_compiler.abs().to_numpy() - return result + result = self._query_compiler.abs() + return array(query_compiler=result) def _add(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): - result = self._query_compiler.add(x2._query_compiler).to_numpy() - return result + result = self._query_compiler.add(x2._query_compiler) + return array(query_compiler=result) + + def _divide(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.truediv(x2._query_compiler) + return array(query_compiler=result) + + def _float_power(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.add(x2._query_compiler) + return array(query_compiler=result) + + def _floor_divide(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.floordiv(x2._query_compiler) + return array(query_compiler=result) - def _all(self, axis=None, out=None, keepdims=None, where=None): - result = self._query_compiler.all(axis=axis).to_numpy() - return result + def _power(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.pow(x2._query_compiler) + return array(query_compiler=result) + + def _prod(self, axis=None, out=None, keepdims=None, where=None): + print("Series?", self._query_compiler.is_series_like()) + if axis is None: + result = self._query_compiler.prod(axis=0).prod(axis=1) + return array(query_compiler=result) + else: + result = self._query_compiler.prod(axis=axis) + return array(query_compiler=result) + + def _multiply(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.mul(x2._query_compiler) + return array(query_compiler=result) + + def _remainder(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + result = self._query_compiler.mod(x2._query_compiler) + return array(query_compiler=result) def _subtract(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): - result = self._query_compiler.sub(x2._query_compiler).to_numpy() - return result + result = self._query_compiler.sub(x2._query_compiler) + return array(query_compiler=result) def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): result = self._query_compiler.sum(axis=axis) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index 15f0e82a6bc..5b73d0a449a 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -20,6 +20,50 @@ def all(a, axis=None, out=None, keepdims=None, where=None): return numpy.all(a, axis=axis, out=out, keepdims=keepdims, where=where) +def divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_divide"): + return x1._divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def float_power(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_float_power"): + return x1._float_power(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.float_power(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def floor_divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_floor_divide"): + return x1._floor_divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.floor_divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def power(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_power"): + return x1._power(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.power(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def prod(a, axis=None, out=None, keepdims=None, where=None): + if hasattr(a, "_prod"): + return a._prod(axis=axis, out=out, keepdims=keepdims, where=where) + return numpy.prod(a, axis=axis, out=out, keepdims=keepdims, where=where) + + +def multiply(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_multiply"): + return x1._multiply(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.multiply(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + + +def remainder(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_remainder"): + return x1._remainder(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.remainder(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + +mod = remainder + + def subtract(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): if hasattr(x1, "_subtract"): return x1._subtract(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) @@ -31,3 +75,8 @@ def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where return arr._sum(axis) else: return numpy.sum(arr) + +def true_divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + if hasattr(x1, "_divide"): + return x1._divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return numpy.divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) From 30171d20e70aaf10ca12c1c0d65b32e4750e248c Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 15 Nov 2022 13:00:35 -0800 Subject: [PATCH 08/42] Add list constructor Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index aa0a5125f40..00f115b22f1 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -5,6 +5,10 @@ class array(object): def __init__(self, object=None, dtype=None, *, copy=True, order='K', subok=False, ndmin=0, like=None, query_compiler=None): if query_compiler is not None: self._query_compiler = query_compiler + elif isinstance(object, list): + import modin.pandas as pd + qc = pd.DataFrame(object)._query_compiler + self._query_compiler = qc else: arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndmin=ndmin, like=like) pass From ab0ecdb87e326411f7e6911e08d7733e136b481d Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 12 Dec 2022 11:47:25 -0800 Subject: [PATCH 09/42] lint Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 164 ++++++++++++++++++++++---- modin/numpy/constants.py | 2 +- modin/numpy/math.py | 241 ++++++++++++++++++++++++++++++++++----- 3 files changed, 353 insertions(+), 54 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 00f115b22f1..133e9a3dbb0 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -1,39 +1,112 @@ import numpy -class array(object): - def __init__(self, object=None, dtype=None, *, copy=True, order='K', subok=False, ndmin=0, like=None, query_compiler=None): +class array(object): + def __init__( + self, + object=None, + dtype=None, + *, + copy=True, + order="K", + subok=False, + ndmin=0, + like=None, + query_compiler=None, + ): if query_compiler is not None: self._query_compiler = query_compiler elif isinstance(object, list): import modin.pandas as pd + qc = pd.DataFrame(object)._query_compiler self._query_compiler = qc else: - arr = numpy.array(object, dtype=dtype, copy=copy, order=order, subok=subok, ndmin=ndmin, like=like) + arr = numpy.array( + object, + dtype=dtype, + copy=copy, + order=order, + subok=subok, + ndmin=ndmin, + like=like, + ) pass - def _absolute(self, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _absolute( + self, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.abs() return array(query_compiler=result) - def _add(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _add( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.add(x2._query_compiler) return array(query_compiler=result) - def _divide(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _divide( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.truediv(x2._query_compiler) return array(query_compiler=result) - def _float_power(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _float_power( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.add(x2._query_compiler) return array(query_compiler=result) - def _floor_divide(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _floor_divide( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.floordiv(x2._query_compiler) return array(query_compiler=result) - def _power(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _power( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.pow(x2._query_compiler) return array(query_compiler=result) @@ -46,19 +119,48 @@ def _prod(self, axis=None, out=None, keepdims=None, where=None): result = self._query_compiler.prod(axis=axis) return array(query_compiler=result) - def _multiply(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _multiply( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.mul(x2._query_compiler) return array(query_compiler=result) - def _remainder(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _remainder( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.mod(x2._query_compiler) return array(query_compiler=result) - def _subtract(self, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + def _subtract( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): result = self._query_compiler.sub(x2._query_compiler) return array(query_compiler=result) - def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): + def _sum( + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + ): result = self._query_compiler.sum(axis=axis) if dtype is not None: result = result.astype(dtype) @@ -68,29 +170,45 @@ def _sum(self, axis=None, dtype=None, out=None, keepdims=None, initial=None, whe return array(query_compiler=result) def _get_shape(self): - return (len(self._query_compiler.index), len(self._query_compiler.columns)) - + return (len(self._query_compiler.index), len(self._query_compiler.columns)) + def _set_shape(self, new_shape): if not (isinstance(new_shape, int)) and not isinstance(new_shape, tuple): - raise TypeError(f"expected a sequence of integers or a single integer, got '{new_shape}'") + raise TypeError( + f"expected a sequence of integers or a single integer, got '{new_shape}'" + ) elif isinstance(new_shape, tuple): for dim in new_shape: if not isinstance(dim, int): - raise TypeError(f"'{type(dim)}' object cannot be interpreted as an integer") + raise TypeError( + f"'{type(dim)}' object cannot be interpreted as an integer" + ) from math import prod + new_dimensions = new_shape if isinstance(new_shape, int) else prod(new_shape) if new_dimensions != prod(self._get_shape()): - raise ValueError(f"cannot reshape array of size {prod(self._get_shape)} into {new_shape if isinstance(new_shape, tuple) else (new_shape,)}") + raise ValueError( + f"cannot reshape array of size {prod(self._get_shape)} into {new_shape if isinstance(new_shape, tuple) else (new_shape,)}" + ) if isinstance(new_shape, int): qcs = [] for index_val in self._query_compiler.index[1:]: - qcs.append(self._query_compiler.getitem_row_array([index_val]).reset_index(drop=True)) - self._query_compiler = self._query_compiler.getitem_row_array([self._query_compiler.index[0]]).reset_index(drop=True).concat(1, qcs, ignore_index=True) + qcs.append( + self._query_compiler.getitem_row_array([index_val]).reset_index( + drop=True + ) + ) + self._query_compiler = ( + self._query_compiler.getitem_row_array([self._query_compiler.index[0]]) + .reset_index(drop=True) + .concat(1, qcs, ignore_index=True) + ) else: - raise NotImplementedError("Reshaping from a 2D object to a 2D object is not currently supported!") - + raise NotImplementedError( + "Reshaping from a 2D object to a 2D object is not currently supported!" + ) + shape = property(_get_shape, _set_shape) - def __repr__(self): return repr(self._query_compiler.to_numpy()) diff --git a/modin/numpy/constants.py b/modin/numpy/constants.py index 8d720e9b3af..a24eba02a25 100644 --- a/modin/numpy/constants.py +++ b/modin/numpy/constants.py @@ -13,5 +13,5 @@ infty, nan, newaxis, - pi + pi, ) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index 5b73d0a449a..202288a96a6 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -1,17 +1,44 @@ import numpy -def absolute(x, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + +def absolute( + x, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x, "_absolute"): - return x._absolute(out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.absolute(x, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x._absolute( + out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok + ) + return numpy.absolute( + x, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok + ) + abs = absolute -def add(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def add( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_add"): - return x1._add(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.add(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._add( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.add( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def all(a, axis=None, out=None, keepdims=None, where=None): @@ -20,28 +47,104 @@ def all(a, axis=None, out=None, keepdims=None, where=None): return numpy.all(a, axis=axis, out=out, keepdims=keepdims, where=where) -def divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def divide( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_divide"): - return x1._divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.divide( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) -def float_power(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def float_power( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_float_power"): - return x1._float_power(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.float_power(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._float_power( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.float_power( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) -def floor_divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def floor_divide( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_floor_divide"): - return x1._floor_divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.floor_divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._floor_divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.floor_divide( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) -def power(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def power( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_power"): - return x1._power(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.power(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._power( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.power( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def prod(a, axis=None, out=None, keepdims=None, where=None): @@ -50,24 +153,82 @@ def prod(a, axis=None, out=None, keepdims=None, where=None): return numpy.prod(a, axis=axis, out=out, keepdims=keepdims, where=where) -def multiply(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def multiply( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_multiply"): - return x1._multiply(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.multiply(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._multiply( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.multiply( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) -def remainder(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def remainder( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_remainder"): - return x1._remainder(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.remainder(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._remainder( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.remainder( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + mod = remainder -def subtract(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): +def subtract( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_subtract"): - return x1._subtract(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.subtract(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._subtract( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.subtract( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): @@ -76,7 +237,27 @@ def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where else: return numpy.sum(arr) -def true_divide(x1, x2, out=None, where=True, casting='same_kind', order='K', dtype=None, subok=True): + +def true_divide( + x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True +): if hasattr(x1, "_divide"): - return x1._divide(x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) - return numpy.divide(x1, x2, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok) + return x1._divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return numpy.divide( + x1, + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) From 25510bc095dc9fce71cff843cbafacd1cc2f2f74 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 11 Jan 2023 19:15:40 -0800 Subject: [PATCH 10/42] Add dimension handling Signed-off-by: Rehan Durrani --- modin/config/envvars.py | 5 +++ modin/numpy/__init__.py | 13 +++++++ modin/numpy/arr.py | 78 +++++++++++++++++++++++++++------------- modin/numpy/constants.py | 13 +++++++ modin/numpy/math.py | 13 +++++++ modin/pandas/base.py | 12 +++++-- modin/pandas/series.py | 22 +++++++----- 7 files changed, 121 insertions(+), 35 deletions(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index d16a463d34f..28c23214e1a 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -627,6 +627,11 @@ class TestReadFromPostgres(EnvironmentVariable, type=bool): varname = "MODIN_TEST_READ_FROM_POSTGRES" default = False +class ExperimentalNumPyAPI(EnvironmentVariable, type=bool): + """Set to true to use Modin's experimental NumPy API.""" + + varname = "MODIN_EXPERIMENTAL_NUMPY_API" + default = False class ReadSqlEngine(EnvironmentVariable, type=str): """Engine to run `read_sql`.""" diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index 8c79d10ade8..903ae6b9948 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -1,3 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + from .arr import * from .math import * from .constants import * diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 133e9a3dbb0..c588d9df3c6 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -1,5 +1,20 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + import numpy +from pandas.core.dtypes.common import is_list_like +from modin.error_message import ErrorMessage class array(object): def __init__( @@ -12,26 +27,33 @@ def __init__( subok=False, ndmin=0, like=None, - query_compiler=None, + _query_compiler=None, + _ndim=None, ): - if query_compiler is not None: - self._query_compiler = query_compiler - elif isinstance(object, list): + if _query_compiler is not None: + self._query_compiler = _query_compiler + self._ndim = _ndim + elif is_list_like(object) and not is_list_like(object[0]): import modin.pandas as pd qc = pd.DataFrame(object)._query_compiler self._query_compiler = qc + self._ndim = 1 else: + expected_kwargs = {"dtype":None, "copy":True,"order":"K","subok":False,"ndmin":0,"like":None} + rcvd_kwargs = {"dtype":dtype, "copy":copy,"order":order,"subok":subok,"ndmin":ndmin,"like":like} + for key, value in rcvd_kwargs.copy().items(): + if value == expected_kwargs[key]: + rcvd_kwargs.pop(key) arr = numpy.array( object, - dtype=dtype, - copy=copy, - order=order, - subok=subok, - ndmin=ndmin, - like=like, + **rcvd_kwargs ) - pass + self._ndim = len(arr.shape) + if self._ndim > 2: + ErrorMessage.not_implemented("NumPy arrays with dimensions higher than 2 are not yet supported.") + import modin.pandas as pd + self._query_compiler = pd.DataFrame(object)._query_compiler def _absolute( self, @@ -43,7 +65,7 @@ def _absolute( subok=True, ): result = self._query_compiler.abs() - return array(query_compiler=result) + return array(_query_compiler=result) def _add( self, @@ -55,8 +77,9 @@ def _add( dtype=None, subok=True, ): - result = self._query_compiler.add(x2._query_compiler) - return array(query_compiler=result) + broadcast = self._ndim != x2._ndim + result = self._query_compiler.add(x2._query_compiler, broadcast=broadcast) + return array(_query_compiler=result) def _divide( self, @@ -69,7 +92,7 @@ def _divide( subok=True, ): result = self._query_compiler.truediv(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _float_power( self, @@ -82,7 +105,7 @@ def _float_power( subok=True, ): result = self._query_compiler.add(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _floor_divide( self, @@ -95,7 +118,7 @@ def _floor_divide( subok=True, ): result = self._query_compiler.floordiv(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _power( self, @@ -108,16 +131,16 @@ def _power( subok=True, ): result = self._query_compiler.pow(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _prod(self, axis=None, out=None, keepdims=None, where=None): print("Series?", self._query_compiler.is_series_like()) if axis is None: result = self._query_compiler.prod(axis=0).prod(axis=1) - return array(query_compiler=result) + return array(_query_compiler=result) else: result = self._query_compiler.prod(axis=axis) - return array(query_compiler=result) + return array(_query_compiler=result) def _multiply( self, @@ -130,7 +153,7 @@ def _multiply( subok=True, ): result = self._query_compiler.mul(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _remainder( self, @@ -143,7 +166,7 @@ def _remainder( subok=True, ): result = self._query_compiler.mod(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _subtract( self, @@ -156,7 +179,7 @@ def _subtract( subok=True, ): result = self._query_compiler.sub(x2._query_compiler) - return array(query_compiler=result) + return array(_query_compiler=result) def _sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None @@ -167,9 +190,11 @@ def _sum( if out is not None: out._query_compiler = result return - return array(query_compiler=result) + return array(_query_compiler=result) def _get_shape(self): + if self._ndim == 1: + return (len(self._query_compiler.columns),) return (len(self._query_compiler.index), len(self._query_compiler.columns)) def _set_shape(self, new_shape): @@ -211,4 +236,7 @@ def _set_shape(self, new_shape): shape = property(_get_shape, _set_shape) def __repr__(self): - return repr(self._query_compiler.to_numpy()) + arr = self._query_compiler.to_numpy() + if self._ndim == 1: + arr = arr.flatten() + return repr(arr) diff --git a/modin/numpy/constants.py b/modin/numpy/constants.py index a24eba02a25..794a003754b 100644 --- a/modin/numpy/constants.py +++ b/modin/numpy/constants.py @@ -1,3 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + from numpy import ( Inf, Infinity, diff --git a/modin/numpy/math.py b/modin/numpy/math.py index 202288a96a6..bf6061eca80 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -1,3 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + import numpy diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 7289b315bbb..f3538d8f7b0 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3238,9 +3238,17 @@ def to_numpy( """ Convert the `BasePandasDataset` to a NumPy array. """ - from ..numpy.arr import array + from modin.config import ExperimentalNumPyAPI + if ExperimentalNumPyAPI.get(): + from ..numpy.arr import array - return array(query_compiler=self._query_compiler) + return array(_query_compiler=self._query_compiler, _ndim=2) + + return self._query_compiler.to_numpy( + dtype=dtype, + copy=copy, + na_value=na_value, + ) # TODO(williamma12): When this gets implemented, have the series one call this. def to_period( diff --git a/modin/pandas/series.py b/modin/pandas/series.py index e93a91e0b59..d0604cbd25a 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1925,15 +1925,21 @@ def to_numpy( """ Return the NumPy ndarray representing the values in this Series or Index. """ - return ( - super(Series, self) - .to_numpy( - dtype=dtype, - copy=copy, - na_value=na_value, + from modin.config import ExperimentalNumPyAPI + if not ExperimentalNumPyAPI.get(): + return ( + super(Series, self) + .to_numpy( + dtype=dtype, + copy=copy, + na_value=na_value, + ) + .flatten() ) - .flatten() - ) + else: + from ..numpy.arr import array + + return array(_query_compiler=self._query_compiler, _ndim=1) tolist = to_list From 43e3bb5d17fadee6eb2b378aa1a1884a859398fc Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 11 Jan 2023 19:27:44 -0800 Subject: [PATCH 11/42] Fix partial broadcasting issues Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index c588d9df3c6..2d052c15095 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -77,9 +77,17 @@ def _add( dtype=None, subok=True, ): - broadcast = self._ndim != x2._ndim - result = self._query_compiler.add(x2._query_compiler, broadcast=broadcast) - return array(_query_compiler=result) + broadcast = (self._ndim != x2._ndim) + if broadcast: + # Workaround for GH#5529. + caller = x2 if self._ndim == 1 else self + callee = self if self._ndim == 1 else x2 + result = caller._query_compiler.add(callee._query_compiler, broadcast=True) + new_ndim = 2 + else: + result = self._query_compiler.add(x2._query_compiler) + new_ndim = self._ndim + return array(_query_compiler=result, _ndim=new_ndim) def _divide( self, From 4301b9d98b8d5d58ff0f77d5847f6c13574c5ab0 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 12 Jan 2023 15:26:51 -0800 Subject: [PATCH 12/42] Add testing Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 127 ++++++++++++++++++++++++--------- modin/numpy/test/__init__.py | 12 ++++ modin/numpy/test/test_array.py | 60 ++++++++++++++++ 3 files changed, 167 insertions(+), 32 deletions(-) create mode 100644 modin/numpy/test/__init__.py create mode 100644 modin/numpy/test/test_array.py diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 2d052c15095..1c7151993ed 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -36,7 +36,7 @@ def __init__( elif is_list_like(object) and not is_list_like(object[0]): import modin.pandas as pd - qc = pd.DataFrame(object)._query_compiler + qc = pd.Series(object)._query_compiler self._query_compiler = qc self._ndim = 1 else: @@ -53,7 +53,7 @@ def __init__( if self._ndim > 2: ErrorMessage.not_implemented("NumPy arrays with dimensions higher than 2 are not yet supported.") import modin.pandas as pd - self._query_compiler = pd.DataFrame(object)._query_compiler + self._query_compiler = pd.DataFrame(arr)._query_compiler def _absolute( self, @@ -65,7 +65,35 @@ def _absolute( subok=True, ): result = self._query_compiler.abs() - return array(_query_compiler=result) + return array(_query_compiler=result, _ndim=self._ndim) + + __abs__ = _absolute + + def _binary_op(self, other): + broadcast = (self._ndim != other._ndim) + if broadcast: + # In this case, we have a 1D object doing a binary op with a 2D object + caller = self if self._ndim == 2 else other + callee = other if self._ndim == 2 else self + return (caller, callee, caller._ndim, {"broadcast": broadcast, "axis": 1}) + else: + if self.shape != other.shape: + # In this case, we either have two mismatched objects trying to do an operation + # or a nested 1D object that must be broadcasted trying to do an operation. + matched_dimension = None + if self.shape[0] == other.shape[0]: + matched_dimension = 0 + elif self.shape[1] == other.shape[1]: + matched_dimension = 1 + if not matched_dimension is None: + if self.shape[matched_dimension ^ 1] == 1 or other.shape[matched_dimension ^ 1] == 1: + # caller = self if other.shape[matched_dimension ^ 1] == 1 else other + # callee = other if other.shape[matched_dimension ^ 1] == 1 else self + return (self, other, self._ndim, {"broadcast":True, "axis":1}) + else: + raise ValueError(f"operands could not be broadcast together with shapes {self.shape} {other.shape}") + else: + return (self, other, self._ndim, {"broadcast":False}) def _add( self, @@ -77,18 +105,12 @@ def _add( dtype=None, subok=True, ): - broadcast = (self._ndim != x2._ndim) - if broadcast: - # Workaround for GH#5529. - caller = x2 if self._ndim == 1 else self - callee = self if self._ndim == 1 else x2 - result = caller._query_compiler.add(callee._query_compiler, broadcast=True) - new_ndim = 2 - else: - result = self._query_compiler.add(x2._query_compiler) - new_ndim = self._ndim + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.add(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) + __add__ = _add + def _divide( self, x2, @@ -99,9 +121,14 @@ def _divide( dtype=None, subok=True, ): - result = self._query_compiler.truediv(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) + if caller != self: + result = result.rtruediv(1) + return array(_query_compiler=result, _ndim=new_ndim) + __truediv__ = _divide + def _float_power( self, x2, @@ -112,8 +139,7 @@ def _float_power( dtype=None, subok=True, ): - result = self._query_compiler.add(x2._query_compiler) - return array(_query_compiler=result) + pass def _floor_divide( self, @@ -125,8 +151,16 @@ def _floor_divide( dtype=None, subok=True, ): - result = self._query_compiler.floordiv(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller != self: + # No workaround possible until broadcasting fixed. GH#5529. + pass + result = caller._query_compiler.floordiv(callee._query_compiler, **kwargs) + if any(callee._query_compiler.eq(0).to_pandas()): + result = result.replace(numpy.inf, 0) + return array(_query_compiler=result, _ndim=new_ndim) + + __floordiv__ = _floor_divide def _power( self, @@ -138,11 +172,16 @@ def _power( dtype=None, subok=True, ): - result = self._query_compiler.pow(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller != self: + # No workaround possible until broadcasting fixed. GH#5529. + pass + result = caller._query_compiler.pow(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + __pow__ = _power def _prod(self, axis=None, out=None, keepdims=None, where=None): - print("Series?", self._query_compiler.is_series_like()) if axis is None: result = self._query_compiler.prod(axis=0).prod(axis=1) return array(_query_compiler=result) @@ -160,8 +199,11 @@ def _multiply( dtype=None, subok=True, ): - result = self._query_compiler.mul(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.mul(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + __mul__ = _multiply def _remainder( self, @@ -173,8 +215,16 @@ def _remainder( dtype=None, subok=True, ): - result = self._query_compiler.mod(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller != self: + # No workaround possible until broadcasting fixed. GH#5529. + pass + result = caller._query_compiler.mod(callee._query_compiler, **kwargs) + if any(callee._query_compiler.eq(0).to_pandas()): + result = result.replace(numpy.NaN, 0) + return array(_query_compiler=result, _ndim=new_ndim) + + __mod__ = _remainder def _subtract( self, @@ -186,23 +236,32 @@ def _subtract( dtype=None, subok=True, ): - result = self._query_compiler.sub(x2._query_compiler) - return array(_query_compiler=result) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.sub(callee._query_compiler, **kwargs) + if caller != self: + result = result.rsub(0) + return array(_query_compiler=result, _ndim=new_ndim) + + __sub__ = _subtract def _sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None - ): + ): result = self._query_compiler.sum(axis=axis) if dtype is not None: result = result.astype(dtype) if out is not None: out._query_compiler = result return - return array(_query_compiler=result) + if axis is None: + return + else: + new_ndim = self._ndim - 1 + return array(_query_compiler=result, _ndim=new_ndim) def _get_shape(self): if self._ndim == 1: - return (len(self._query_compiler.columns),) + return (len(self._query_compiler.index),) return (len(self._query_compiler.index), len(self._query_compiler.columns)) def _set_shape(self, new_shape): @@ -236,6 +295,7 @@ def _set_shape(self, new_shape): .reset_index(drop=True) .concat(1, qcs, ignore_index=True) ) + self._ndim = 1 else: raise NotImplementedError( "Reshaping from a 2D object to a 2D object is not currently supported!" @@ -244,7 +304,10 @@ def _set_shape(self, new_shape): shape = property(_get_shape, _set_shape) def __repr__(self): + return repr(self._to_numpy()) + + def _to_numpy(self): arr = self._query_compiler.to_numpy() if self._ndim == 1: arr = arr.flatten() - return repr(arr) + return arr diff --git a/modin/numpy/test/__init__.py b/modin/numpy/test/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/numpy/test/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py new file mode 100644 index 00000000000..ef96b89c2fb --- /dev/null +++ b/modin/numpy/test/test_array.py @@ -0,0 +1,60 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import numpy +import pytest +import modin.numpy as np + +@pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) +def test_repr(size): + numpy_arr = numpy.random.randint(-100, 100, size=size) + modin_arr = np.array(numpy_arr) + assert repr(modin_arr) == repr(numpy_arr) + +@pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) +def test_shape(size): + numpy_arr = numpy.random.randint(-100, 100, size=size) + modin_arr = np.array(numpy_arr) + assert modin_arr.shape == numpy_arr.shape + +@pytest.mark.parametrize("operand1shape", [100, (3, 100)]) +@pytest.mark.parametrize("operand2shape", [100, (3, 100)]) +@pytest.mark.parametrize("operator", ["__add__", "__sub__", "__truediv__", "__mul__"]) +def test_basic_arithmetic_with_broadcast(operand1shape, operand2shape, operator): + """Test of operators that support broadcasting.""" + operand1 = numpy.random.randint(-100, 100, size=operand1shape) + operand2 = numpy.random.randint(-100, 100, size=operand2shape) + modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) + numpy_result = operand1.__getattribute__(operator)(operand2) + if operator != "__truediv__": + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result, err_msg=f"Binary Op {operator} failed.") + else: + # Truediv can have precision issues. + numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg="Binary Op __truediv__ failed.") + +@pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) +def test_complex_arithmetic(operator): + """Test of operators that do not yet support broadcasting""" + operand1 = numpy.random.randint(-100, 100, size=100) + lower_bound = -100 if operator != "__pow__" else 0 + operand2 = numpy.random.randint(lower_bound, 100, size=100) + modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) + numpy_result = operand1.__getattribute__(operator)(operand2) + numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg=f"Binary Op {operator} failed on 1D arrays.") + + operand1 = numpy.random.randint(-100, 100, size=(10, 10)) + lower_bound = -100 if operator != "__pow__" else 0 + operand2 = numpy.random.randint(lower_bound, 100, size=(10, 10)) + modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) + numpy_result = operand1.__getattribute__(operator)(operand2) + numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg=f"Binary Op {operator} failed on 2D arrays.") From 5ceca0264113795c6836f4d8c82e32da4297b118 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 12 Jan 2023 15:38:47 -0800 Subject: [PATCH 13/42] Add tests to CI Signed-off-by: Rehan Durrani --- .github/workflows/push-to-master.yml | 2 + .github/workflows/push.yml | 2 + modin/numpy/arr.py | 66 ++++++++++++++++++---------- modin/numpy/constants.py | 1 + 4 files changed, 48 insertions(+), 23 deletions(-) diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml index 73a66367e05..3886886b596 100644 --- a/.github/workflows/push-to-master.yml +++ b/.github/workflows/push-to-master.yml @@ -55,6 +55,7 @@ jobs: python -m pytest modin/pandas/test/dataframe/test_udf.py python -m pytest modin/pandas/test/dataframe/test_window.py python -m pytest modin/pandas/test/test_series.py + python -m pytest modin/numpy/test/test_array.py python -m pytest modin/pandas/test/test_rolling.py python -m pytest modin/pandas/test/test_concat.py python -m pytest modin/pandas/test/test_groupby.py @@ -121,6 +122,7 @@ jobs: - modin/pandas/test/dataframe/test_window.py - modin/pandas/test/dataframe/test_pickle.py - modin/pandas/test/test_series.py + - modin/numpy/test/test_array.py - modin/pandas/test/test_rolling.py - modin/pandas/test/test_concat.py - modin/pandas/test/test_groupby.py diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 9e3f36c1ab2..bc01666948d 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -297,6 +297,7 @@ jobs: - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py - run: python -m pytest -n 2 modin/pandas/test/test_series.py + - run: python -m pytest -n 2 modin/numpy/test/test_array.py - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - run: python -m pytest -n 2 modin/pandas/test/test_concat.py if: matrix.engine == 'python' @@ -334,6 +335,7 @@ jobs: - modin/pandas/test/dataframe/test_window.py - modin/pandas/test/dataframe/test_pickle.py - modin/pandas/test/test_series.py + - modin/numpy/test/test_array.py - modin/pandas/test/test_rolling.py - modin/pandas/test/test_concat.py - modin/pandas/test/test_groupby.py diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 1c7151993ed..f5b6ffa34e7 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import is_list_like from modin.error_message import ErrorMessage + class array(object): def __init__( self, @@ -40,19 +41,33 @@ def __init__( self._query_compiler = qc self._ndim = 1 else: - expected_kwargs = {"dtype":None, "copy":True,"order":"K","subok":False,"ndmin":0,"like":None} - rcvd_kwargs = {"dtype":dtype, "copy":copy,"order":order,"subok":subok,"ndmin":ndmin,"like":like} + expected_kwargs = { + "dtype": None, + "copy": True, + "order": "K", + "subok": False, + "ndmin": 0, + "like": None, + } + rcvd_kwargs = { + "dtype": dtype, + "copy": copy, + "order": order, + "subok": subok, + "ndmin": ndmin, + "like": like, + } for key, value in rcvd_kwargs.copy().items(): if value == expected_kwargs[key]: rcvd_kwargs.pop(key) - arr = numpy.array( - object, - **rcvd_kwargs - ) + arr = numpy.array(object, **rcvd_kwargs) self._ndim = len(arr.shape) if self._ndim > 2: - ErrorMessage.not_implemented("NumPy arrays with dimensions higher than 2 are not yet supported.") + ErrorMessage.not_implemented( + "NumPy arrays with dimensions higher than 2 are not yet supported." + ) import modin.pandas as pd + self._query_compiler = pd.DataFrame(arr)._query_compiler def _absolute( @@ -66,11 +81,11 @@ def _absolute( ): result = self._query_compiler.abs() return array(_query_compiler=result, _ndim=self._ndim) - + __abs__ = _absolute - + def _binary_op(self, other): - broadcast = (self._ndim != other._ndim) + broadcast = self._ndim != other._ndim if broadcast: # In this case, we have a 1D object doing a binary op with a 2D object caller = self if self._ndim == 2 else other @@ -85,15 +100,20 @@ def _binary_op(self, other): matched_dimension = 0 elif self.shape[1] == other.shape[1]: matched_dimension = 1 - if not matched_dimension is None: - if self.shape[matched_dimension ^ 1] == 1 or other.shape[matched_dimension ^ 1] == 1: + if matched_dimension is not None: + if ( + self.shape[matched_dimension ^ 1] == 1 + or other.shape[matched_dimension ^ 1] == 1 + ): # caller = self if other.shape[matched_dimension ^ 1] == 1 else other # callee = other if other.shape[matched_dimension ^ 1] == 1 else self - return (self, other, self._ndim, {"broadcast":True, "axis":1}) + return (self, other, self._ndim, {"broadcast": True, "axis": 1}) else: - raise ValueError(f"operands could not be broadcast together with shapes {self.shape} {other.shape}") + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {other.shape}" + ) else: - return (self, other, self._ndim, {"broadcast":False}) + return (self, other, self._ndim, {"broadcast": False}) def _add( self, @@ -128,7 +148,7 @@ def _divide( return array(_query_compiler=result, _ndim=new_ndim) __truediv__ = _divide - + def _float_power( self, x2, @@ -139,7 +159,7 @@ def _float_power( dtype=None, subok=True, ): - pass + pass def _floor_divide( self, @@ -159,7 +179,7 @@ def _floor_divide( if any(callee._query_compiler.eq(0).to_pandas()): result = result.replace(numpy.inf, 0) return array(_query_compiler=result, _ndim=new_ndim) - + __floordiv__ = _floor_divide def _power( @@ -178,7 +198,7 @@ def _power( pass result = caller._query_compiler.pow(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - + __pow__ = _power def _prod(self, axis=None, out=None, keepdims=None, where=None): @@ -241,12 +261,12 @@ def _subtract( if caller != self: result = result.rsub(0) return array(_query_compiler=result, _ndim=new_ndim) - + __sub__ = _subtract def _sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None - ): + ): result = self._query_compiler.sum(axis=axis) if dtype is not None: result = result.astype(dtype) @@ -254,7 +274,7 @@ def _sum( out._query_compiler = result return if axis is None: - return + return else: new_ndim = self._ndim - 1 return array(_query_compiler=result, _ndim=new_ndim) @@ -305,7 +325,7 @@ def _set_shape(self, new_shape): def __repr__(self): return repr(self._to_numpy()) - + def _to_numpy(self): arr = self._query_compiler.to_numpy() if self._ndim == 1: diff --git a/modin/numpy/constants.py b/modin/numpy/constants.py index 794a003754b..0d7576516f9 100644 --- a/modin/numpy/constants.py +++ b/modin/numpy/constants.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +# flake8: noqa from numpy import ( Inf, Infinity, From ff9045c3b1b576281130f714b9318b2500db845b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 2 Feb 2023 12:51:20 -0800 Subject: [PATCH 14/42] Add __array_ufunc__, __array_function__, and clean up implementation of array class, add ravel and flatten as well Signed-off-by: Rehan Durrani --- modin/config/envvars.py | 2 + modin/numpy/__init__.py | 2 +- modin/numpy/arr.py | 417 ++++++++++++++++++++++++++------- modin/numpy/math.py | 156 +++--------- modin/numpy/test/test_array.py | 103 ++++++-- modin/pandas/base.py | 3 +- modin/pandas/series.py | 1 + 7 files changed, 449 insertions(+), 235 deletions(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 28c23214e1a..8059375e507 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -627,12 +627,14 @@ class TestReadFromPostgres(EnvironmentVariable, type=bool): varname = "MODIN_TEST_READ_FROM_POSTGRES" default = False + class ExperimentalNumPyAPI(EnvironmentVariable, type=bool): """Set to true to use Modin's experimental NumPy API.""" varname = "MODIN_EXPERIMENTAL_NUMPY_API" default = False + class ReadSqlEngine(EnvironmentVariable, type=str): """Engine to run `read_sql`.""" diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index 903ae6b9948..75a801f23fc 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -11,6 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -from .arr import * +from .arr import array from .math import * from .constants import * diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index f5b6ffa34e7..32723067199 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -10,14 +10,39 @@ # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +"""Module houses ``array`` class, that is distributed version of ``numpy.array``.""" +from math import prod import numpy - from pandas.core.dtypes.common import is_list_like +from pandas.api.types import is_scalar +from inspect import signature + +import modin.pandas as pd from modin.error_message import ErrorMessage +from modin.core.dataframe.algebra import ( + Map, + Reduce, + Binary, +) + + +_INTEROPERABLE_TYPES = (pd.DataFrame, pd.Series) class array(object): + """ + Modin distributed representation of ``numpy.array``. + + Internally, the data can be divided into partitions along both columns and rows + in order to parallelize computations and utilize the user's hardware as much as possible. + + Notes + ----- + The ``array`` class is a lightweight shim that relies on the pandas Query Compiler in order to + provide functionality. + """ + def __init__( self, object=None, @@ -27,7 +52,7 @@ def __init__( order="K", subok=False, ndmin=0, - like=None, + like=numpy._NoValue, _query_compiler=None, _ndim=None, ): @@ -35,42 +60,138 @@ def __init__( self._query_compiler = _query_compiler self._ndim = _ndim elif is_list_like(object) and not is_list_like(object[0]): - import modin.pandas as pd - - qc = pd.Series(object)._query_compiler - self._query_compiler = qc + self._query_compiler = pd.Series(object)._query_compiler self._ndim = 1 else: - expected_kwargs = { + target_kwargs = { "dtype": None, "copy": True, "order": "K", "subok": False, "ndmin": 0, - "like": None, - } - rcvd_kwargs = { - "dtype": dtype, - "copy": copy, - "order": order, - "subok": subok, - "ndmin": ndmin, - "like": like, + "like": numpy._NoValue, } - for key, value in rcvd_kwargs.copy().items(): - if value == expected_kwargs[key]: - rcvd_kwargs.pop(key) - arr = numpy.array(object, **rcvd_kwargs) + for key, value in target_kwargs.copy().items(): + if value == locals()[key]: + target_kwargs.pop(key) + else: + target_kwargs[key] = locals()[key] + arr = numpy.array(object, **target_kwargs) self._ndim = len(arr.shape) if self._ndim > 2: ErrorMessage.not_implemented( "NumPy arrays with dimensions higher than 2 are not yet supported." ) - import modin.pandas as pd self._query_compiler = pd.DataFrame(arr)._query_compiler - - def _absolute( + # These two lines are necessary so that our query compiler does not keep track of indices + # and try to map like indices to like indices. (e.g. if we multiply two arrays that used + # to be dataframes, and the dataframes had the same column names but ordered differently + # we want to do a simple broadcast where we only consider position, as numpy would, rather + # than pair columns with the same name and multiply them.) + self._query_compiler = self._query_compiler.reset_index(drop=True) + self._query_compiler.columns = range(len(self._query_compiler.columns)) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + ufunc_name = ufunc.__name__ + supported_array_layer = hasattr(self, ufunc_name) or hasattr( + self, f"__{ufunc_name}__" + ) + if supported_array_layer: + args = [] + for input in inputs: + if not (isinstance(input, array) or is_scalar(input)): + if isinstance(input, _INTEROPERABLE_TYPES): + ndim = 2 if isinstance(input, pd.DataFrame) else 1 + input = array(_query_compiler=input._query_compiler, _ndim=ndim) + else: + input = array(input) + args += [input] + function = ( + getattr(args[0], ufunc_name) + if hasattr(args[0], ufunc_name) + else getattr(args[0], f"__{ufunc_name}__") + ) + len_expected_arguments = len( + [ + param + for param in signature(function).parameters.values() + if param.kind == param.POSITIONAL_ONLY + ] + ) + if len_expected_arguments == len(args): + return function(*tuple(args[1:]), **kwargs) + else: + ErrorMessage.single_warning( + f"{ufunc} method {method} is not yet supported in Modin. Defaulting to NumPy." + ) + args = [] + for input in inputs: + if isinstance(input, array): + input = input._to_numpy() + if isinstance(input, pd.DataFrame): + input = input._query_compiler.to_numpy() + if isinstance(input, pd.Series): + input = input._query_compiler.to_numpy().flatten() + args += [input] + output = args[0].__array_ufunc__(ufunc, method, *args, **kwargs) + if is_scalar(output): + return output + return array(output) + new_ufunc = None + out_ndim = -1 + if method == "__call__": + if len(inputs) == 1: + new_ufunc = Map.register(ufunc) + out_ndim = len(inputs[0].shape) + else: + new_ufunc = Binary.register(ufunc) + out_ndim = max([len(inp.shape) for inp in inputs]) + elif method == "reduce": + new_ufunc = Reduce.register(ufunc, axis=kwargs.get("axis", None)) + if kwargs.get("axis", None) is None: + out_ndim = 0 + else: + out_ndim = len(inputs[0].shape) - 1 + elif method == "accumulate": + new_ufunc = Reduce.register(ufunc, axis=None) + out_ndim = 0 + if new_ufunc is None: + ErrorMessage.single_warning( + f"{ufunc} is not yet supported in Modin. Defaulting to NumPy." + ) + args = [] + for input in inputs: + if isinstance(input, array): + input = input._to_numpy() + if isinstance(input, pd.DataFrame): + input = input._query_compiler.to_numpy() + if isinstance(input, pd.Series): + input = input._query_compiler.to_numpy().flatten() + args += [input] + output = ufunc(*args, **kwargs) + if is_scalar(output): + return output + return array(output) + args = [] + for input in inputs: + if not (isinstance(input, array) or is_scalar(input)): + if isinstance(input, _INTEROPERABLE_TYPES): + ndim = 2 if isinstance(input, pd.DataFrame) else 1 + input = array(_query_compiler=input._query_compiler, _ndim=ndim) + else: + input = array(input) + args += [ + input._query_compiler if hasattr(input, "_query_compiler") else input + ] + return array(_query_compiler=new_ufunc(*args, **kwargs), _ndim=out_ndim) + + def __array_function__(self, func, types, args, kwargs): + if func.__name__ == "ravel": + return self.flatten() + return NotImplemented + + def __abs__( self, out=None, where=True, @@ -82,32 +203,43 @@ def _absolute( result = self._query_compiler.abs() return array(_query_compiler=result, _ndim=self._ndim) - __abs__ = _absolute + absolute = __abs__ def _binary_op(self, other): + if not isinstance(other, array): + if isinstance(other, _INTEROPERABLE_TYPES): + ndim = 2 if isinstance(other, pd.DataFrame) else 1 + other = array(_query_compiler=other._query_compiler, _ndim=ndim) + else: + raise TypeError( + f"Unsupported operand type(s) for divide: '{type(self)}' and '{type(other)}'" + ) broadcast = self._ndim != other._ndim if broadcast: # In this case, we have a 1D object doing a binary op with a 2D object - caller = self if self._ndim == 2 else other - callee = other if self._ndim == 2 else self + caller, callee = (self, other) if self._ndim == 2 else (other, self) + if callee.shape[0] != caller.shape[1]: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {other.shape}" + ) return (caller, callee, caller._ndim, {"broadcast": broadcast, "axis": 1}) else: if self.shape != other.shape: # In this case, we either have two mismatched objects trying to do an operation # or a nested 1D object that must be broadcasted trying to do an operation. - matched_dimension = None if self.shape[0] == other.shape[0]: matched_dimension = 0 elif self.shape[1] == other.shape[1]: matched_dimension = 1 - if matched_dimension is not None: - if ( - self.shape[matched_dimension ^ 1] == 1 - or other.shape[matched_dimension ^ 1] == 1 - ): - # caller = self if other.shape[matched_dimension ^ 1] == 1 else other - # callee = other if other.shape[matched_dimension ^ 1] == 1 else self - return (self, other, self._ndim, {"broadcast": True, "axis": 1}) + else: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {other.shape}" + ) + if ( + self.shape[matched_dimension ^ 1] == 1 + or other.shape[matched_dimension ^ 1] == 1 + ): + return (self, other, self._ndim, {"broadcast": True, "axis": 1}) else: raise ValueError( f"operands could not be broadcast together with shapes {self.shape} {other.shape}" @@ -115,7 +247,7 @@ def _binary_op(self, other): else: return (self, other, self._ndim, {"broadcast": False}) - def _add( + def __add__( self, x2, out=None, @@ -125,13 +257,25 @@ def _add( dtype=None, subok=True, ): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.add(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.add(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - __add__ = _add + def __radd__( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): + return self._add(x2, out, where, casting, order, dtype, subok) - def _divide( + def divide( self, x2, out=None, @@ -141,15 +285,23 @@ def _divide( dtype=None, subok=True, ): + if is_scalar(x2): + return array( + _query_compiler=self._query_compiler.truediv(x2), _ndim=self._ndim + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) if caller != self: - result = result.rtruediv(1) + # In this case, we are doing an operation that looks like this 1D_object/2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object.rtruediv(1D_object). + result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - __truediv__ = _divide + __truediv__ = divide - def _float_power( + def __rtruediv__( self, x2, out=None, @@ -159,9 +311,18 @@ def _float_power( dtype=None, subok=True, ): - pass + if is_scalar(x2): + return array( + _query_compiler=self._query_compiler.rtruediv(x2), _ndim=self._ndim + ) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller != self: + result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) - def _floor_divide( + def floor_divide( self, x2, out=None, @@ -171,18 +332,31 @@ def _floor_divide( dtype=None, subok=True, ): + if is_scalar(x2): + result = self._query_compiler.floordiv(x2) + if x2 == 0: + # NumPy's floor_divide by 0 works differently from pandas', so we need to fix + # the output. + result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) + return array(_query_compiler=result, _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller != self: - # No workaround possible until broadcasting fixed. GH#5529. - pass + # Modin does not correctly support broadcasting when the caller of the function is + # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using + # commutativity, and `rfloordiv` also works incorrectly. GH#5529 + raise NotImplementedError( + "Using floor_divide with broadcast is not currently available in Modin." + ) result = caller._query_compiler.floordiv(callee._query_compiler, **kwargs) - if any(callee._query_compiler.eq(0).to_pandas()): - result = result.replace(numpy.inf, 0) + if any(callee._query_compiler.eq(0).any()): + # NumPy's floor_divide by 0 works differently from pandas', so we need to fix + # the output. + result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) return array(_query_compiler=result, _ndim=new_ndim) - __floordiv__ = _floor_divide + __floordiv__ = floor_divide - def _power( + def power( self, x2, out=None, @@ -192,24 +366,32 @@ def _power( dtype=None, subok=True, ): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.pow(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller != self: - # No workaround possible until broadcasting fixed. GH#5529. - pass + # Modin does not correctly support broadcasting when the caller of the function is + # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using + # commutativity, and `rpow` also works incorrectly. GH#5529 + raise NotImplementedError( + "Using power with broadcast is not currently available in Modin." + ) result = caller._query_compiler.pow(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - __pow__ = _power + __pow__ = power - def _prod(self, axis=None, out=None, keepdims=None, where=None): + def prod(self, axis=None, out=None, keepdims=None, where=None): if axis is None: result = self._query_compiler.prod(axis=0).prod(axis=1) - return array(_query_compiler=result) + return result.to_numpy()[0, 0] else: result = self._query_compiler.prod(axis=axis) - return array(_query_compiler=result) + if self._ndim == 1: + return result.to_numpy()[0, 0] + return array(_query_compiler=result, _ndim=1) - def _multiply( + def multiply( self, x2, out=None, @@ -219,13 +401,27 @@ def _multiply( dtype=None, subok=True, ): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.mul(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.mul(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - __mul__ = _multiply + __mul__ = multiply + + def __rmul__( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): + return self._multiply(x2, out, where, casting, order, dtype, subok) - def _remainder( + def remainder( self, x2, out=None, @@ -235,18 +431,33 @@ def _remainder( dtype=None, subok=True, ): + if is_scalar(x2): + result = array( + _query_compiler=self._query_compiler.mod(x2), _ndim=self._ndim + ) + if x2 == 0: + # NumPy's remainder by 0 works differently from pandas', so we need to fix + # the output. + result._query_compiler = result._query_compiler.replace(numpy.NaN, 0) + return result caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller != self: - # No workaround possible until broadcasting fixed. GH#5529. - pass + # Modin does not correctly support broadcasting when the caller of the function is + # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using + # commutativity, and `rmod` also works incorrectly. GH#5529 + raise NotImplementedError( + "Using remainder with broadcast is not currently available in Modin." + ) result = caller._query_compiler.mod(callee._query_compiler, **kwargs) - if any(callee._query_compiler.eq(0).to_pandas()): + if any(callee._query_compiler.eq(0).any()): + # NumPy's floor_divide by 0 works differently from pandas', so we need to fix + # the output. result = result.replace(numpy.NaN, 0) return array(_query_compiler=result, _ndim=new_ndim) - __mod__ = _remainder + __mod__ = remainder - def _subtract( + def subtract( self, x2, out=None, @@ -256,29 +467,69 @@ def _subtract( dtype=None, subok=True, ): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.sub(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) - result = caller._query_compiler.sub(callee._query_compiler, **kwargs) if caller != self: - result = result.rsub(0) + # In this case, we are doing an operation that looks like this 1D_object - 2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object.rsub(1D_object). + result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.sub(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - __sub__ = _subtract + __sub__ = subtract + + def __rsub__( + self, + x2, + out=None, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, + ): + if is_scalar(x2): + return array( + _query_compiler=self._query_compiler.rsub(x2), _ndim=self._ndim + ) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller != self: + # In this case, we are doing an operation that looks like this 1D_object - 2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object.sub(1D_object). + result = caller._query_compiler.sub(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) - def _sum( + def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None ): result = self._query_compiler.sum(axis=axis) + new_ndim = self._ndim - 1 + if axis is None or new_ndim == 0: + return result.to_numpy()[0, 0] if dtype is not None: result = result.astype(dtype) - if out is not None: - out._query_compiler = result - return - if axis is None: - return - else: - new_ndim = self._ndim - 1 return array(_query_compiler=result, _ndim=new_ndim) + def flatten(self, order="C"): + qcs = [ + self._query_compiler.getitem_row_array([index_val]).reset_index(drop=True) + for index_val in self._query_compiler.index[1:] + ] + new_query_compiler = ( + self._query_compiler.getitem_row_array([self._query_compiler.index[0]]) + .reset_index(drop=True) + .concat(1, qcs, ignore_index=True) + ) + new_query_compiler.columns = range(len(new_query_compiler.columns)) + new_ndim = 1 + return array(_query_compiler=new_query_compiler, _ndim=new_ndim) + def _get_shape(self): if self._ndim == 1: return (len(self._query_compiler.index),) @@ -295,26 +546,14 @@ def _set_shape(self, new_shape): raise TypeError( f"'{type(dim)}' object cannot be interpreted as an integer" ) - from math import prod new_dimensions = new_shape if isinstance(new_shape, int) else prod(new_shape) if new_dimensions != prod(self._get_shape()): raise ValueError( - f"cannot reshape array of size {prod(self._get_shape)} into {new_shape if isinstance(new_shape, tuple) else (new_shape,)}" + f"cannot reshape array of size {prod(self._get_shape())} into {new_shape if isinstance(new_shape, tuple) else (new_shape,)}" ) if isinstance(new_shape, int): - qcs = [] - for index_val in self._query_compiler.index[1:]: - qcs.append( - self._query_compiler.getitem_row_array([index_val]).reset_index( - drop=True - ) - ) - self._query_compiler = ( - self._query_compiler.getitem_row_array([self._query_compiler.index[0]]) - .reset_index(drop=True) - .concat(1, qcs, ignore_index=True) - ) + self._query_compiler = self.flatten()._query_compiler self._ndim = 1 else: raise NotImplementedError( diff --git a/modin/numpy/math.py b/modin/numpy/math.py index bf6061eca80..e2cf5f7c5a5 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -11,19 +11,14 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import numpy - def absolute( x, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x, "_absolute"): - return x._absolute( + if hasattr(x, "absolute"): + return x.absolute( out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok ) - return numpy.absolute( - x, out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok - ) abs = absolute @@ -32,8 +27,8 @@ def absolute( def add( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_add"): - return x1._add( + if hasattr(x1, "add"): + return x1.add( x2, out=out, where=where, @@ -42,29 +37,18 @@ def add( dtype=dtype, subok=subok, ) - return numpy.add( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def all(a, axis=None, out=None, keepdims=None, where=None): - if hasattr(a, "_all"): - return a._all(axis=axis, out=out, keepdims=keepdims, where=where) - return numpy.all(a, axis=axis, out=out, keepdims=keepdims, where=where) + if hasattr(a, "all"): + return a.all(axis=axis, out=out, keepdims=keepdims, where=where) def divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_divide"): - return x1._divide( + if hasattr(x1, "divide"): + return x1.divide( x2, out=out, where=where, @@ -73,23 +57,13 @@ def divide( dtype=dtype, subok=subok, ) - return numpy.divide( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def float_power( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_float_power"): - return x1._float_power( + if hasattr(x1, "float_power"): + return x1.float_power( x2, out=out, where=where, @@ -98,23 +72,13 @@ def float_power( dtype=dtype, subok=subok, ) - return numpy.float_power( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def floor_divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_floor_divide"): - return x1._floor_divide( + if hasattr(x1, "floor_divide"): + return x1.floor_divide( x2, out=out, where=where, @@ -123,23 +87,13 @@ def floor_divide( dtype=dtype, subok=subok, ) - return numpy.floor_divide( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def power( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_power"): - return x1._power( + if hasattr(x1, "power"): + return x1.power( x2, out=out, where=where, @@ -148,29 +102,18 @@ def power( dtype=dtype, subok=subok, ) - return numpy.power( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def prod(a, axis=None, out=None, keepdims=None, where=None): - if hasattr(a, "_prod"): - return a._prod(axis=axis, out=out, keepdims=keepdims, where=where) - return numpy.prod(a, axis=axis, out=out, keepdims=keepdims, where=where) + if hasattr(a, "prod"): + return a.prod(axis=axis, out=out, keepdims=keepdims, where=where) def multiply( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_multiply"): - return x1._multiply( + if hasattr(x1, "multiply"): + return x1.multiply( x2, out=out, where=where, @@ -179,23 +122,13 @@ def multiply( dtype=dtype, subok=subok, ) - return numpy.multiply( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def remainder( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_remainder"): - return x1._remainder( + if hasattr(x1, "remainder"): + return x1.remainder( x2, out=out, where=where, @@ -204,16 +137,6 @@ def remainder( dtype=dtype, subok=subok, ) - return numpy.remainder( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) mod = remainder @@ -222,8 +145,8 @@ def remainder( def subtract( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_subtract"): - return x1._subtract( + if hasattr(x1, "subtract"): + return x1.subtract( x2, out=out, where=where, @@ -232,30 +155,18 @@ def subtract( dtype=dtype, subok=subok, ) - return numpy.subtract( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): - if hasattr(arr, "_sum"): - return arr._sum(axis) - else: - return numpy.sum(arr) + if hasattr(arr, "sum"): + return arr.sum(axis) def true_divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "_divide"): - return x1._divide( + if hasattr(x1, "divide"): + return x1.divide( x2, out=out, where=where, @@ -264,13 +175,8 @@ def true_divide( dtype=dtype, subok=subok, ) - return numpy.divide( - x1, - x2, - out=out, - where=where, - casting=casting, - order=order, - dtype=dtype, - subok=subok, - ) + + +def ravel(arr, order="C"): + if hasattr(arr, "flatten"): + return arr.flatten(order) diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index ef96b89c2fb..e3b963a2290 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -15,46 +15,111 @@ import pytest import modin.numpy as np + @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_repr(size): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) assert repr(modin_arr) == repr(numpy_arr) + @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_shape(size): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) assert modin_arr.shape == numpy_arr.shape + @pytest.mark.parametrize("operand1shape", [100, (3, 100)]) @pytest.mark.parametrize("operand2shape", [100, (3, 100)]) -@pytest.mark.parametrize("operator", ["__add__", "__sub__", "__truediv__", "__mul__"]) +@pytest.mark.parametrize( + "operator", + [ + "__add__", + "__sub__", + "__truediv__", + "__mul__", + "__rtruediv__", + "__rmul__", + "__radd__", + "__rsub__", + ], +) def test_basic_arithmetic_with_broadcast(operand1shape, operand2shape, operator): """Test of operators that support broadcasting.""" operand1 = numpy.random.randint(-100, 100, size=operand1shape) operand2 = numpy.random.randint(-100, 100, size=operand2shape) - modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) - numpy_result = operand1.__getattribute__(operator)(operand2) - if operator != "__truediv__": - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result, err_msg=f"Binary Op {operator} failed.") + modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) + numpy_result = getattr(operand1, operator)(operand2) + if operator not in ["__truediv__", "__rtruediv__"]: + numpy.testing.assert_array_equal( + modin_result._to_numpy(), + numpy_result, + err_msg=f"Binary Op {operator} failed.", + ) else: # Truediv can have precision issues. - numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg="Binary Op __truediv__ failed.") + numpy.testing.assert_array_almost_equal( + modin_result._to_numpy(), + numpy_result, + decimal=12, + err_msg="Binary Op __truediv__ failed.", + ) + @pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) def test_complex_arithmetic(operator): """Test of operators that do not yet support broadcasting""" - operand1 = numpy.random.randint(-100, 100, size=100) - lower_bound = -100 if operator != "__pow__" else 0 - operand2 = numpy.random.randint(lower_bound, 100, size=100) - modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) - numpy_result = operand1.__getattribute__(operator)(operand2) - numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg=f"Binary Op {operator} failed on 1D arrays.") - - operand1 = numpy.random.randint(-100, 100, size=(10, 10)) - lower_bound = -100 if operator != "__pow__" else 0 - operand2 = numpy.random.randint(lower_bound, 100, size=(10, 10)) - modin_result = np.array(operand1).__getattribute__(operator)(np.array(operand2)) - numpy_result = operand1.__getattribute__(operator)(operand2) - numpy.testing.assert_array_almost_equal(modin_result._to_numpy(), numpy_result, decimal=12, err_msg=f"Binary Op {operator} failed on 2D arrays.") + for size, textdim in ((100, "1D"), ((10, 10), "2D")): + operand1 = numpy.random.randint(-100, 100, size=size) + lower_bound = -100 if operator != "__pow__" else 0 + operand2 = numpy.random.randint(lower_bound, 100, size=size) + modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) + numpy_result = getattr(operand1, operator)(operand2) + numpy.testing.assert_array_almost_equal( + modin_result._to_numpy(), + numpy_result, + decimal=12, + err_msg=f"Binary Op {operator} failed on {textdim} arrays.", + ) + + +@pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) +def test_scalar_arithmetic(size): + numpy_arr = numpy.random.randint(-100, 100, size=size) + modin_arr = np.array(numpy_arr) + scalar = numpy.random.randint(1, 100) + numpy.testing.assert_array_equal( + (scalar * modin_arr)._to_numpy(), scalar * numpy_arr, err_msg=f"__mul__ failed." + ) + numpy.testing.assert_array_equal( + (modin_arr * scalar)._to_numpy(), + scalar * numpy_arr, + err_msg=f"__rmul__ failed.", + ) + numpy.testing.assert_array_equal( + (scalar / modin_arr)._to_numpy(), + scalar / numpy_arr, + err_msg=f"__rtruediv__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr / scalar)._to_numpy(), + numpy_arr / scalar, + err_msg=f"__truediv__ failed.", + ) + numpy.testing.assert_array_equal( + (scalar + modin_arr)._to_numpy(), + scalar + numpy_arr, + err_msg=f"__radd__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr + scalar)._to_numpy(), scalar + numpy_arr, err_msg=f"__add__ failed." + ) + numpy.testing.assert_array_equal( + (scalar - modin_arr)._to_numpy(), + scalar - numpy_arr, + err_msg=f"__rsub__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg=f"__sub__ failed." + ) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 9d48125af70..5296654d304 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3243,11 +3243,12 @@ def to_numpy( Convert the `BasePandasDataset` to a NumPy array. """ from modin.config import ExperimentalNumPyAPI + if ExperimentalNumPyAPI.get(): from ..numpy.arr import array return array(_query_compiler=self._query_compiler, _ndim=2) - + return self._query_compiler.to_numpy( dtype=dtype, copy=copy, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index b0022b1f0b0..49aae1d1232 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1949,6 +1949,7 @@ def to_numpy( Return the NumPy ndarray representing the values in this Series or Index. """ from modin.config import ExperimentalNumPyAPI + if not ExperimentalNumPyAPI.get(): return ( super(Series, self) From 4b174de1ca930639529bca41d291c00c0d743ec0 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 2 Feb 2023 18:31:43 -0800 Subject: [PATCH 15/42] Add where Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 235 ++++++++++++++++++++++++++++++++++++++------ modin/numpy/math.py | 11 +++ 2 files changed, 218 insertions(+), 28 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 32723067199..9a298d56b30 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -14,7 +14,7 @@ from math import prod import numpy -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import is_list_like, is_numeric_dtype from pandas.api.types import is_scalar from inspect import signature @@ -30,6 +30,15 @@ _INTEROPERABLE_TYPES = (pd.DataFrame, pd.Series) +def try_convert_from_interoperable_type(obj): + if isinstance(obj, _INTEROPERABLE_TYPES): + obj = array( + _query_compiler=obj._query_compiler, + _ndim=2 if isinstance(obj, pd.DataFrame) else 1, + ) + return obj + + class array(object): """ Modin distributed representation of ``numpy.array``. @@ -60,7 +69,8 @@ def __init__( self._query_compiler = _query_compiler self._ndim = _ndim elif is_list_like(object) and not is_list_like(object[0]): - self._query_compiler = pd.Series(object)._query_compiler + series = pd.Series(object) + self._query_compiler = series._query_compiler self._ndim = 1 else: target_kwargs = { @@ -100,12 +110,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if supported_array_layer: args = [] for input in inputs: + input = try_convert_from_interoperable_type(input) if not (isinstance(input, array) or is_scalar(input)): - if isinstance(input, _INTEROPERABLE_TYPES): - ndim = 2 if isinstance(input, pd.DataFrame) else 1 - input = array(_query_compiler=input._query_compiler, _ndim=ndim) - else: - input = array(input) + input = array(input) args += [input] function = ( getattr(args[0], ufunc_name) @@ -175,12 +182,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return array(output) args = [] for input in inputs: + input = try_convert_from_interoperable_type(input) if not (isinstance(input, array) or is_scalar(input)): - if isinstance(input, _INTEROPERABLE_TYPES): - ndim = 2 if isinstance(input, pd.DataFrame) else 1 - input = array(_query_compiler=input._query_compiler, _ndim=ndim) - else: - input = array(input) + input = array(input) args += [ input._query_compiler if hasattr(input, "_query_compiler") else input ] @@ -189,8 +193,92 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): if func.__name__ == "ravel": return self.flatten() + if func.__name__ == "where": + return self.where(*list(args)[1:]) return NotImplemented + def where(self, x=None, y=None): + if x is None and y is None: + ErrorMessage.single_warning( + f"np.where method with only condition specified is not yet supported in Modin. Defaulting to NumPy." + ) + condition = self._to_numpy() + return array(numpy.where(condition)) + x, y = try_convert_from_interoperable_type( + x + ), try_convert_from_interoperable_type(y) + if not ( + (isinstance(x, array) or is_scalar(x)) + and (isinstance(y, array) or is_scalar(y)) + ): + raise ValueError( + "np.where requires x and y to either be np.arrays or scalars." + ) + if is_scalar(x) and is_scalar(y): + ErrorMessage.single_warning( + "np.where not supported when both x and y are scalars. Defaulting to NumPy." + ) + return array(numpy.where(self._query_compiler.to_pandas(), x, y)) + if is_scalar(x) and not is_scalar(y): + if self._ndim < y._ndim: + if not self.shape[0] == y.shape[1]: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {y.shape}" + ) + ErrorMessage.single_warning( + "np.where method where condition must be broadcast is not yet available in Modin. Defaulting to NumPy." + ) + return array(numpy.where(self._to_numpy(), x, y._to_numpy())) + elif self._ndim == y._ndim: + if not self.shape == y.shape: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {y.shape}" + ) + return array( + _query_compiler=y._query_compiler.where((~self)._query_compiler, x), + _ndim=y._ndim, + ) + else: + ErrorMessage.single_warning( + "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy." + ) + return numpy.where(self._to_numpy(), x, y._to_numpy()) + if not is_scalar(x) and is_scalar(y): + if self._ndim < x._ndim: + if not self.shape[0] == x.shape[1]: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {x.shape}" + ) + ErrorMessage.single_warning( + "np.where method where condition must be broadcast is not yet available in Modin. Defaulting to NumPy." + ) + return array(numpy.where(self._to_numpy(), x._to_numpy(), y)) + elif self._ndim == x._ndim: + if not self.shape == x.shape: + raise ValueError( + f"operands could not be broadcast together with shapes {self.shape} {x.shape}" + ) + return array( + _query_compiler=x._query_compiler.where(self._query_compiler, y), + _ndim=x._ndim, + ) + else: + ErrorMessage.single_warning( + "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy." + ) + return array(numpy.where(self._to_numpy(), x._to_numpy(), y)) + if not (x.shape == y.shape and y.shape == self.shape): + ErrorMessage.single_warning( + "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy." + ) + return array(numpy.where(self._to_numpy(), x._to_numpy(), y._to_numpy())) + return array( + _query_compiler=x._query_compiler.where( + self._query_compiler, y._query_compiler + ), + _ndim=self._ndim, + ) + def __abs__( self, out=None, @@ -205,15 +293,25 @@ def __abs__( absolute = __abs__ + def __invert__(self): + """ + Apply bitwise inverse to each element of the `BasePandasDataset`. + + Returns + ------- + BasePandasDataset + New BasePandasDataset containing bitwise inverse to each value. + """ + if not is_numeric_dtype(self.dtype): + raise TypeError(f"bad operand type for unary ~: '{self.dtype}'") + return array(_query_compiler=self._query_compiler.invert(), _ndim=self._ndim) + def _binary_op(self, other): + other = try_convert_from_interoperable_type(other) if not isinstance(other, array): - if isinstance(other, _INTEROPERABLE_TYPES): - ndim = 2 if isinstance(other, pd.DataFrame) else 1 - other = array(_query_compiler=other._query_compiler, _ndim=ndim) - else: - raise TypeError( - f"Unsupported operand type(s) for divide: '{type(self)}' and '{type(other)}'" - ) + raise TypeError( + f"Unsupported operand type(s): '{type(self)}' and '{type(other)}'" + ) broadcast = self._ndim != other._ndim if broadcast: # In this case, we have a 1D object doing a binary op with a 2D object @@ -247,6 +345,72 @@ def _binary_op(self, other): else: return (self, other, self._ndim, {"broadcast": False}) + def __ge__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.ge(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller._query_compiler != self._query_compiler: + # In this case, we are doing an operation that looks like this 1D_object >= 2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object <= 1D_object. + result = caller._query_compiler.le(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.ge(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + def __gt__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.gt(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller._query_compiler != self._query_compiler: + # In this case, we are doing an operation that looks like this 1D_object > 2D_object. + # For Modin to broadcast directly, we hiave to swap it so that the operation is actually + # 2D_object < 1D_object. + result = caller._query_compiler.lt(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.gt(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + def __le__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.le(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller._query_compiler != self._query_compiler: + # In this case, we are doing an operation that looks like this 1D_object <= 2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object >= 1D_object. + result = caller._query_compiler.ge(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.le(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + def __lt__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.lt(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + if caller._query_compiler != self._query_compiler: + # In this case, we are doing an operation that looks like this 1D_object < 2D_object. + # For Modin to broadcast directly, we have to swap it so that the operation is actually + # 2D_object > 1D_object. + result = caller._query_compiler.gt(callee._query_compiler, **kwargs) + else: + result = caller._query_compiler.lt(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + def __eq__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.eq(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.eq(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + + def __ne__(self, x2): + if is_scalar(x2): + return array(_query_compiler=self._query_compiler.ne(x2), _ndim=self._ndim) + caller, callee, new_ndim, kwargs = self._binary_op(x2) + result = caller._query_compiler.ne(callee._query_compiler, **kwargs) + return array(_query_compiler=result, _ndim=new_ndim) + def __add__( self, x2, @@ -273,7 +437,7 @@ def __radd__( dtype=None, subok=True, ): - return self._add(x2, out, where, casting, order, dtype, subok) + return self.add(x2, out, where, casting, order, dtype, subok) def divide( self, @@ -290,7 +454,7 @@ def divide( _query_compiler=self._query_compiler.truediv(x2), _ndim=self._ndim ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object/2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.rtruediv(1D_object). @@ -316,7 +480,7 @@ def __rtruediv__( _query_compiler=self._query_compiler.rtruediv(x2), _ndim=self._ndim ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) else: result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) @@ -340,7 +504,7 @@ def floor_divide( result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) return array(_query_compiler=result, _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using # commutativity, and `rfloordiv` also works incorrectly. GH#5529 @@ -369,7 +533,7 @@ def power( if is_scalar(x2): return array(_query_compiler=self._query_compiler.pow(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using # commutativity, and `rpow` also works incorrectly. GH#5529 @@ -419,7 +583,7 @@ def __rmul__( dtype=None, subok=True, ): - return self._multiply(x2, out, where, casting, order, dtype, subok) + return self.multiply(x2, out, where, casting, order, dtype, subok) def remainder( self, @@ -441,7 +605,7 @@ def remainder( result._query_compiler = result._query_compiler.replace(numpy.NaN, 0) return result caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using # commutativity, and `rmod` also works incorrectly. GH#5529 @@ -470,7 +634,7 @@ def subtract( if is_scalar(x2): return array(_query_compiler=self._query_compiler.sub(x2), _ndim=self._ndim) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.rsub(1D_object). @@ -496,7 +660,7 @@ def __rsub__( _query_compiler=self._query_compiler.rsub(x2), _ndim=self._ndim ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - if caller != self: + if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.sub(1D_object). @@ -562,6 +726,21 @@ def _set_shape(self, new_shape): shape = property(_get_shape, _set_shape) + def transpose(self, *args, **kwargs): + if self._ndim == 1: + return self + return array(_query_compiler=self._query_compiler.transpose(), _ndim=self._ndim) + + T = property(transpose) + + @property + def dtype(self): + dtype = self._query_compiler.dtypes + if self._ndim == 1: + return dtype[0] + else: + return numpy.result_type(dtype.values) + def __repr__(self): return repr(self._to_numpy()) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index e2cf5f7c5a5..a82e3997d55 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -180,3 +180,14 @@ def true_divide( def ravel(arr, order="C"): if hasattr(arr, "flatten"): return arr.flatten(order) + + +# def where(condition, *, x=None, y=None): +# condition = condition(self) if callable else condition +# if not isinstance(condition, array): +# if isinstance(condition, _INTEROPERABLE_TYPES): +# condition = array(_query_compiler=condition._query_compiler, _ndim = 2 if isinstance(condition, pd.DataFrame) else 1) +# elif is_list_like(condition): +# condition = array(condition) +# if not (is_scalar(condition) or isinstance(condition, (bool, array))): +# raise ValueError(f"np.where does not support conditionals of type {type(condition)}") From bd2fe9845191167938dee746b7573de186a6d0c0 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 2 Feb 2023 18:58:54 -0800 Subject: [PATCH 16/42] Fix df conversion retaining index issue Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 9a298d56b30..cb05893b259 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -32,8 +32,10 @@ def try_convert_from_interoperable_type(obj): if isinstance(obj, _INTEROPERABLE_TYPES): + new_qc = obj._query_compiler.reset_index(drop=True) + new_qc.columns = range(len(new_qc.columns)) obj = array( - _query_compiler=obj._query_compiler, + _query_compiler=new_qc, _ndim=2 if isinstance(obj, pd.DataFrame) else 1, ) return obj From 2a87a3919f63b7253084311cca862f0a43400a34 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 3 Feb 2023 20:02:22 -0800 Subject: [PATCH 17/42] Add max and min and other numpy methods to namespace Signed-off-by: Rehan Durrani --- modin/numpy/__init__.py | 114 +++++++++- modin/numpy/arr.py | 384 ++++++++++++++++++++++++++++++---- modin/numpy/array_creation.py | 42 ++++ modin/numpy/array_shaping.py | 47 +++++ modin/numpy/math.py | 48 +++-- 5 files changed, 570 insertions(+), 65 deletions(-) create mode 100644 modin/numpy/array_creation.py create mode 100644 modin/numpy/array_shaping.py diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index 75a801f23fc..1dae8dc0c1c 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -12,5 +12,115 @@ # governing permissions and limitations under the License. from .arr import array -from .math import * -from .constants import * + +from .array_creation import ( + zeros_like, + ones_like, +) + +from .array_shaping import ( + ravel, + shape, + transpose, +) + +from .math import ( + absolute, + abs, + add, + divide, + float_power, + floor_divide, + power, + prod, + multiply, + remainder, + mod, + subtract, + sum, + true_divide, + mean, + maximum, + amax, + max, + minimum, + amin, + min, +) + +from .constants import ( + Inf, + Infinity, + NAN, + NINF, + NZERO, + NaN, + PINF, + PZERO, + e, + euler_gamma, + inf, + infty, + nan, + newaxis, + pi, +) + + +def where(condition, x=None, y=None): + if condition: + return x + if not condition: + return y + if hasattr(condition, "where"): + return condition.where(x=x, y=y) + raise NotImplementedError( + f"np.where for condition of type {type(condition)} is not yet supported in Modin." + ) + + +__all__ = [ # noqa: F405 + "array", + "zeros_like", + "ones_like", + "ravel", + "shape", + "transpose", + "absolute", + "abs", + "add", + "divide", + "float_power", + "floor_divide", + "power", + "prod", + "multiply", + "remainder", + "mod", + "subtract", + "sum", + "true_divide", + "mean", + "maximum", + "amax", + "max", + "minimum", + "amin", + "min", + "where", + "Inf", + "Infinity", + "NAN", + "NINF", + "NZERO", + "NaN", + "PINF", + "PZERO", + "e", + "euler_gamma", + "inf", + "infty", + "nan", + "newaxis", + "pi", +] diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index cb05893b259..010cd569c56 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -41,6 +41,78 @@ def try_convert_from_interoperable_type(obj): return obj +def check_kwargs(order="C", subok=True, keepdims=None, casting="same_kind", where=True): + if order not in ["K", "C"]: + ErrorMessage.single_warning( + "Array order besides 'C' is not currently supported in Modin. Defaulting to 'C' order." + ) + if not subok: + ErrorMessage.single_warning( + "Subclassing types is not currently supported in Modin. Defaulting to the same base dtype." + ) + if keepdims: + ErrorMessage.single_warning( + "Modin does not yet support broadcasting between nested 1D arrays and 2D arrays." + ) + if casting != "same_kind": + ErrorMessage.single_warning( + "Modin does not yet support the `casting` argument." + ) + if not where and where is not None: + # TODO(RehanSD): Remove this once indexing is merged. + raise NotImplementedError( + "Modin currently does not support the `where` parameter." + ) + + +def check_how_broadcast_to_output(arr_in: "array", arr_out: "array"): + if not isinstance(arr_out, array): + raise TypeError("return arrays must be of modin.numpy.array type.") + if arr_out._ndim == arr_in._ndim and arr_out.shape != arr_in.shape: + raise ValueError( + f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" + ) + elif arr_out._ndim == arr_in._ndim: + return "broadcastable" + elif arr_out._ndim == 1: + if prod(arr_in.shape) == arr_out.shape[0]: + return "flatten" + else: + raise ValueError( + f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" + ) + elif arr_in._ndim == 1: + if prod(arr_out.shape) == arr_in.shape[0]: + return "reshape" + else: + raise ValueError( + f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" + ) + + +def fix_dtypes_and_determine_return(query_compiler_in, _ndim, dtype=None, out=None): + if dtype is not None: + query_compiler_in = query_compiler_in.astype( + {col_name: dtype for col_name in query_compiler_in.columns} + ) + result = array(_query_compiler=query_compiler_in, _ndim=_ndim) + if out is not None: + out = try_convert_from_interoperable_type(out) + broadcast_method = check_how_broadcast_to_output(result, out) + result._query_compiler = result._query_compiler.astype( + {col_name: out.dtype for col_name in result._query_compiler} + ) + if broadcast_method == "broadcastable": + out._query_compiler = result._query_compiler + elif broadcast_method == "flatten": + out._query_compiler = result.flatten()._query_compiler + else: + # TODO(RehanSD): Replace this when reshape is implemented. + raise NotImplementedError("Reshape is currently not supported in Modin.") + return out + return result + + class array(object): """ Modin distributed representation of ``numpy.array``. @@ -155,7 +227,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): out_ndim = len(inputs[0].shape) else: new_ufunc = Binary.register(ufunc) - out_ndim = max([len(inp.shape) for inp in inputs]) + out_ndim = max( + [len(inp.shape) for inp in inputs if hasattr(inp, "shape")] + ) elif method == "reduce": new_ufunc = Reduce.register(ufunc, axis=kwargs.get("axis", None)) if kwargs.get("axis", None) is None: @@ -193,16 +267,23 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return array(_query_compiler=new_ufunc(*args, **kwargs), _ndim=out_ndim) def __array_function__(self, func, types, args, kwargs): - if func.__name__ == "ravel": - return self.flatten() - if func.__name__ == "where": - return self.where(*list(args)[1:]) - return NotImplemented + from . import array_creation as creation, array_shaping as shaping, math + + modin_func = None + if hasattr(math, func): + modin_func = getattr(math, func) + elif hasattr(shaping, func): + modin_func = getattr(shaping, func) + elif hasattr(creation, func): + modin_func = getattr(creation, func) + if modin_func is None: + return NotImplemented + return modin_func(*args, **kwargs) def where(self, x=None, y=None): if x is None and y is None: ErrorMessage.single_warning( - f"np.where method with only condition specified is not yet supported in Modin. Defaulting to NumPy." + "np.where method with only condition specified is not yet supported in Modin. Defaulting to NumPy." ) condition = self._to_numpy() return array(numpy.where(condition)) @@ -281,6 +362,100 @@ def where(self, x=None, y=None): _ndim=self._ndim, ) + def max( + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + ): + check_kwargs(keepdims=keepdims, where=where) + if self._ndim == 1: + if axis == 1: + raise numpy.AxisError(1, 1) + result = self._query_compiler.max(axis=0) + if keepdims: + if initial is not None and result.lt(initial): + result = pd.Series([initial])._query_compiler + return fix_dtypes_and_determine_return(result, 1, dtype, out) + if initial is not None: + result = max(result.to_numpy()[0, 0], initial) + else: + result = result.to_numpy()[0, 0] + return result + if axis is None: + result = self.flatten().max( + axis=axis, + dtype=dtype, + out=out, + keepdims=None, + initial=initial, + where=where, + ) + if keepdims: + result._ndim = self._ndim + return result + result = self._query_compiler.max(axis=axis) + new_ndim = self._ndim - 1 if not keepdims else self._ndim + if new_ndim == 0: + if initial is not None: + result = max(result.to_numpy()[0, 0], initial) + else: + result = result.to_numpy()[0, 0] + return result + intermediate = fix_dtypes_and_determine_return( + result.transpose(), new_ndim, dtype, out + ) + if initial is not None: + intermediate._query_compiler = (intermediate > initial).where( + intermediate, initial + ) + else: + return intermediate + + def min( + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + ): + check_kwargs(keepdims=keepdims, where=where) + if self._ndim == 1: + if axis == 1: + raise numpy.AxisError(1, 1) + result = self._query_compiler.min(axis=0) + if keepdims: + if initial is not None and result.lt(initial): + result = pd.Series([initial])._query_compiler + return fix_dtypes_and_determine_return(result, 1, dtype, out) + if initial is not None: + result = min(result.to_numpy()[0, 0], initial) + else: + result = result.to_numpy()[0, 0] + return result + if axis is None: + result = self.flatten().min( + axis=axis, + dtype=dtype, + out=out, + keepdims=None, + initial=initial, + where=where, + ) + if keepdims: + result._ndim = self._ndim + return result + result = self._query_compiler.min(axis=axis) + new_ndim = self._ndim - 1 if not keepdims else self._ndim + if new_ndim == 0: + if initial is not None: + result = min(result.to_numpy()[0, 0], initial) + else: + result = result.to_numpy()[0, 0] + return result + intermediate = fix_dtypes_and_determine_return( + result.transpose(), new_ndim, dtype, out + ) + if initial is not None: + intermediate._query_compiler = (intermediate < initial).where( + intermediate, initial + ) + else: + return intermediate + def __abs__( self, out=None, @@ -290,7 +465,27 @@ def __abs__( dtype=None, subok=True, ): + check_kwargs(order=order, casting=casting, subok=subok, where=where) result = self._query_compiler.abs() + if dtype is not None: + result = result.astype({col_name: dtype for col_name in result.columns}) + if out is not None: + out = try_convert_from_interoperable_type(out) + broadcast_method = check_how_broadcast_to_output(self, out) + if broadcast_method == "broadcastable": + out._query_compiler = result + return out + elif broadcast_method == "flatten": + out._query_compiler = ( + array(_query_compiler=result, _ndim=self._ndim) + .flatten() + ._query_compiler + ) + else: + # TODO(RehanSD): Replace this when reshape is implemented. + raise NotImplementedError( + "Reshape is currently not supported in Modin." + ) return array(_query_compiler=result, _ndim=self._ndim) absolute = __abs__ @@ -413,6 +608,28 @@ def __ne__(self, x2): result = caller._query_compiler.ne(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) + def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=None): + check_kwargs(keepdims=keepdims, where=where) + if self._ndim == 1: + if axis == 1: + raise numpy.AxisError(1, 1) + result = self._query_compiler.mean(axis=0) + if keepdims: + return fix_dtypes_and_determine_return(result, 1, dtype, out) + return result.to_numpy()[0, 0] + if axis is None: + result = self.flatten().mean( + axis=axis, dtype=dtype, out=out, keepdims=None, where=where + ) + if keepdims: + result._ndim = self._ndim + return result + result = self._query_compiler.mean(axis=axis) + new_ndim = self._ndim - 1 if not keepdims else self._ndim + if new_ndim == 0: + return result.to_numpy()[0, 0] + return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) + def __add__( self, x2, @@ -423,11 +640,13 @@ def __add__( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array(_query_compiler=self._query_compiler.add(x2), _ndim=self._ndim) + result = self._query_compiler.add(x2) + return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.add(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def __radd__( self, @@ -439,7 +658,7 @@ def __radd__( dtype=None, subok=True, ): - return self.add(x2, out, where, casting, order, dtype, subok) + return self.__add__(x2, out, where, casting, order, dtype, subok) def divide( self, @@ -451,9 +670,10 @@ def divide( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array( - _query_compiler=self._query_compiler.truediv(x2), _ndim=self._ndim + return fix_dtypes_and_determine_return( + self._query_compiler.truediv(x2), self._ndim, dtype, out ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -463,7 +683,7 @@ def divide( result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) else: result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __truediv__ = divide @@ -477,16 +697,17 @@ def __rtruediv__( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array( - _query_compiler=self._query_compiler.rtruediv(x2), _ndim=self._ndim + return fix_dtypes_and_determine_return( + self._query_compiler.rtruediv(x2), self._ndim, dtype, out ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) else: result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def floor_divide( self, @@ -498,13 +719,14 @@ def floor_divide( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): result = self._query_compiler.floordiv(x2) if x2 == 0: # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) - return array(_query_compiler=result, _ndim=self._ndim) + return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is @@ -518,7 +740,7 @@ def floor_divide( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __floordiv__ = floor_divide @@ -532,8 +754,11 @@ def power( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array(_query_compiler=self._query_compiler.pow(x2), _ndim=self._ndim) + return fix_dtypes_and_determine_return( + self._query_compiler.pow(x2), self._ndim, dtype, out + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is @@ -543,19 +768,42 @@ def power( "Using power with broadcast is not currently available in Modin." ) result = caller._query_compiler.pow(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __pow__ = power - def prod(self, axis=None, out=None, keepdims=None, where=None): + def prod( + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + ): + check_kwargs(keepdims=keepdims, where=where) + if self._ndim == 1: + if axis == 1: + raise numpy.AxisError(1, 1) + result = self._query_compiler.prod(axis=0) + if initial is not None: + result = result.mul(initial) + if keepdims: + return fix_dtypes_and_determine_return(result, 1, dtype, out) + return result.to_numpy()[0, 0] if axis is None: - result = self._query_compiler.prod(axis=0).prod(axis=1) + result = self.flatten().prod( + axis=axis, + dtype=dtype, + out=out, + keepdims=None, + initial=initial, + where=where, + ) + if keepdims: + result._ndim = self._ndim + return result + result = self._query_compiler.prod(axis=axis) + if initial is not None: + result = result.mul(initial) + new_ndim = self._ndim - 1 if not keepdims else self._ndim + if new_ndim == 0: return result.to_numpy()[0, 0] - else: - result = self._query_compiler.prod(axis=axis) - if self._ndim == 1: - return result.to_numpy()[0, 0] - return array(_query_compiler=result, _ndim=1) + return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) def multiply( self, @@ -567,11 +815,14 @@ def multiply( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array(_query_compiler=self._query_compiler.mul(x2), _ndim=self._ndim) + return fix_dtypes_and_determine_return( + self._query_compiler.mul(x2), self._ndim, dtype, out + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.mul(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __mul__ = multiply @@ -597,15 +848,14 @@ def remainder( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - result = array( - _query_compiler=self._query_compiler.mod(x2), _ndim=self._ndim - ) + result = self._query_compiler.mod(x2) if x2 == 0: # NumPy's remainder by 0 works differently from pandas', so we need to fix # the output. - result._query_compiler = result._query_compiler.replace(numpy.NaN, 0) - return result + result = result.replace(numpy.NaN, 0) + return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is @@ -619,7 +869,7 @@ def remainder( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __mod__ = remainder @@ -633,8 +883,11 @@ def subtract( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array(_query_compiler=self._query_compiler.sub(x2), _ndim=self._ndim) + return fix_dtypes_and_determine_return( + self._query_compiler.sub(x2), self._ndim, dtype, out + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. @@ -643,7 +896,7 @@ def subtract( result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) else: result = caller._query_compiler.sub(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) __sub__ = subtract @@ -657,9 +910,10 @@ def __rsub__( dtype=None, subok=True, ): + check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - return array( - _query_compiler=self._query_compiler.rsub(x2), _ndim=self._ndim + return fix_dtypes_and_determine_return( + self._query_compiler.rsub(x2), self._ndim, dtype, out ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -669,20 +923,43 @@ def __rsub__( result = caller._query_compiler.sub(callee._query_compiler, **kwargs) else: result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None ): + check_kwargs(keepdims=keepdims, where=where) + if self._ndim == 1: + if axis == 1: + raise numpy.AxisError(1, 1) + result = self._query_compiler.sum(axis=0) + if initial is not None: + result = result.add(initial) + if keepdims: + return fix_dtypes_and_determine_return(result, 1, dtype, out) + return result.to_numpy()[0, 0] + if axis is None: + result = self.flatten().sum( + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + initial=initial, + where=where, + ) + if keepdims: + result._ndim = self._ndim + return result result = self._query_compiler.sum(axis=axis) - new_ndim = self._ndim - 1 - if axis is None or new_ndim == 0: + if initial is not None: + result = result.add(initial) + new_ndim = self._ndim - 1 if not keepdims else self._ndim + if new_ndim == 0: return result.to_numpy()[0, 0] - if dtype is not None: - result = result.astype(dtype) - return array(_query_compiler=result, _ndim=new_ndim) + return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) def flatten(self, order="C"): + check_kwargs(order=order) qcs = [ self._query_compiler.getitem_row_array([index_val]).reset_index(drop=True) for index_val in self._query_compiler.index[1:] @@ -693,6 +970,7 @@ def flatten(self, order="C"): .concat(1, qcs, ignore_index=True) ) new_query_compiler.columns = range(len(new_query_compiler.columns)) + new_query_compiler = new_query_compiler.transpose() new_ndim = 1 return array(_query_compiler=new_query_compiler, _ndim=new_ndim) @@ -728,7 +1006,7 @@ def _set_shape(self, new_shape): shape = property(_get_shape, _set_shape) - def transpose(self, *args, **kwargs): + def transpose(self): if self._ndim == 1: return self return array(_query_compiler=self._query_compiler.transpose(), _ndim=self._ndim) @@ -743,6 +1021,20 @@ def dtype(self): else: return numpy.result_type(dtype.values) + def astype(self, dtype, order="K", casting="unsafe", subok=True, copy=True): + if casting != "unsafe": + raise ValueError( + "Modin does not support `astype` with `casting != unsafe`." + ) + check_kwargs(order=order, subok=subok) + result = self._query_compiler.astype( + {col_name: dtype for col_name in self._query_compiler.columns} + ) + if copy: + self._query_compiler = result + return self + return array(_query_compiler=result, _ndim=self._ndim) + def __repr__(self): return repr(self._to_numpy()) diff --git a/modin/numpy/array_creation.py b/modin/numpy/array_creation.py new file mode 100644 index 00000000000..33bfb156bf4 --- /dev/null +++ b/modin/numpy/array_creation.py @@ -0,0 +1,42 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Module houses array creation methods for Modin's NumPy API.""" +import numpy +from modin.error_message import ErrorMessage +from .arr import array + + +def _create_array(dtype, shape, order, subok, numpy_method): + if order not in ["K", "C"]: + ErrorMessage.single_warning( + "Array order besides 'C' is not currently supported in Modin. Defaulting to 'C' order." + ) + if not subok: + ErrorMessage.single_warning( + "Subclassing types is not currently supported in Modin. Defaulting to the same base dtype." + ) + ErrorMessage.single_warning(f"np.{numpy_method}_like defaulting to NumPy.") + return array(getattr(numpy, numpy_method)(shape, dtype=dtype)) + + +def zeros_like(a, dtype=None, order="K", subok=True, shape=None): + dtype = a.dtype if dtype is None else dtype + shape = a.shape if shape is None else shape + return _create_array(dtype, shape, order, subok, "zeros") + + +def ones_like(a, dtype=None, order="K", subok=True, shape=None): + dtype = a.dtype if dtype is None else dtype + shape = a.shape if shape is None else shape + return _create_array(dtype, shape, order, subok, "ones") diff --git a/modin/numpy/array_shaping.py b/modin/numpy/array_shaping.py new file mode 100644 index 00000000000..e59cc609666 --- /dev/null +++ b/modin/numpy/array_shaping.py @@ -0,0 +1,47 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Module houses array shaping methods for Modin's NumPy API.""" +from modin.error_message import ErrorMessage + + +def ravel(a, order="C"): + if order != "C": + ErrorMessage.single_warning( + "Array order besides 'C' is not currently supported in Modin. Defaulting to 'C' order." + ) + if hasattr(a, "flatten"): + return a.flatten(order) + raise NotImplementedError( + f"Object of type {type(a)} does not have a flatten method to use for raveling." + ) + + +def shape(a): + if hasattr(a, "shape"): + return a.shape + raise NotImplementedError( + f"Object of type {type(a)} does not have a shape property." + ) + + +def transpose(a, axes=None): + if axes is not None: + raise NotImplementedError( + f"Modin does not support arrays higher than 2-dimensions. Please use `transpose` with `axis=None` on a 2-dimensional or lower object." + ) + if hasattr(a, "transpose"): + return a.transpose() + raise NotImplementedError( + f"Object of type {type(a)} does not have a transpose method." + ) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index a82e3997d55..f38162bef34 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import numpy + def absolute( x, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True @@ -39,11 +41,6 @@ def add( ) -def all(a, axis=None, out=None, keepdims=None, where=None): - if hasattr(a, "all"): - return a.all(axis=axis, out=out, keepdims=keepdims, where=where) - - def divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): @@ -177,17 +174,34 @@ def true_divide( ) -def ravel(arr, order="C"): - if hasattr(arr, "flatten"): - return arr.flatten(order) +def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=None): + if hasattr(x1, "mean"): + return x1.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) + + +# Maximum and minimum are ufunc's in NumPy, which means that our array's __array_ufunc__ +# implementation will automatically handle this, so we can just use NumPy's maximum/minimum +# since that will route to our array's ufunc. +maximum = numpy.maximum + +minimum = numpy.minimum + + +def amax(x1, axis=None, out=None, keepdims=None, initial=None, where=None): + if hasattr(x1, "max"): + return x1.max( + axis=axis, out=out, keepdims=keepdims, initial=initial, where=where + ) + + +max = amax + + +def amin(x1, axis=None, out=None, keepdims=None, initial=None, where=None): + if hasattr(x1, "min"): + return x1.min( + axis=axis, out=out, keepdims=keepdims, initial=initial, where=where + ) -# def where(condition, *, x=None, y=None): -# condition = condition(self) if callable else condition -# if not isinstance(condition, array): -# if isinstance(condition, _INTEROPERABLE_TYPES): -# condition = array(_query_compiler=condition._query_compiler, _ndim = 2 if isinstance(condition, pd.DataFrame) else 1) -# elif is_list_like(condition): -# condition = array(condition) -# if not (is_scalar(condition) or isinstance(condition, (bool, array))): -# raise ValueError(f"np.where does not support conditionals of type {type(condition)}") +min = amin From d513b03084c8f13e35174355e15734cae4ad1359 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sat, 4 Feb 2023 17:50:40 -0800 Subject: [PATCH 18/42] Fix dtype handling Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 010cd569c56..2b79b8dc677 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -113,6 +113,17 @@ def fix_dtypes_and_determine_return(query_compiler_in, _ndim, dtype=None, out=No return result +def find_common_dtype(dtypes): + if len(dtypes) == 1: + return dtypes[0] + elif len(dtypes) == 2: + return numpy.promote_types(*dtypes) + midpoint = len(dtypes) // 2 + return numpy.promote_types( + find_common_dtype(dtypes[:midpoint]), find_common_dtype(dtypes[midpoint:]) + ) + + class array(object): """ Modin distributed representation of ``numpy.array``. @@ -142,10 +153,14 @@ def __init__( if _query_compiler is not None: self._query_compiler = _query_compiler self._ndim = _ndim + new_dtype = find_common_dtype( + numpy.unique(self._query_compiler.dtypes.values) + ) elif is_list_like(object) and not is_list_like(object[0]): series = pd.Series(object) self._query_compiler = series._query_compiler self._ndim = 1 + new_dtype = self._query_compiler.dtypes.values[0] else: target_kwargs = { "dtype": None, @@ -168,6 +183,7 @@ def __init__( ) self._query_compiler = pd.DataFrame(arr)._query_compiler + new_dtype = arr.dtype # These two lines are necessary so that our query compiler does not keep track of indices # and try to map like indices to like indices. (e.g. if we multiply two arrays that used # to be dataframes, and the dataframes had the same column names but ordered differently @@ -175,6 +191,10 @@ def __init__( # than pair columns with the same name and multiply them.) self._query_compiler = self._query_compiler.reset_index(drop=True) self._query_compiler.columns = range(len(self._query_compiler.columns)) + new_dtype = new_dtype if dtype is None else dtype + self._query_compiler = self._query_compiler.astype( + {col_name: new_dtype for col_name in self._query_compiler.columns} + ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ufunc_name = ufunc.__name__ @@ -1019,7 +1039,7 @@ def dtype(self): if self._ndim == 1: return dtype[0] else: - return numpy.result_type(dtype.values) + return find_common_dtype(dtype.values) def astype(self, dtype, order="K", casting="unsafe", subok=True, copy=True): if casting != "unsafe": From d8d0d10dfe2d3274e1c656c648cd3e22ba5d6f70 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 00:04:38 -0800 Subject: [PATCH 19/42] Fix keepdims Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 61 ++++++++++++++++++++++++++++++++------------- modin/numpy/math.py | 10 ++++---- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 2b79b8dc677..9f3e362cdb6 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -383,16 +383,22 @@ def where(self, x=None, y=None): ) def max( - self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): check_kwargs(keepdims=keepdims, where=where) + if initial is None and where is not True: + raise ValueError( + "reduction operation 'maximum' does not have an identity, so to use a where mask one has to specify 'initial'" + ) if self._ndim == 1: if axis == 1: raise numpy.AxisError(1, 1) result = self._query_compiler.max(axis=0) if keepdims: if initial is not None and result.lt(initial): - result = pd.Series([initial])._query_compiler + result = array( + _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 + ) return fix_dtypes_and_determine_return(result, 1, dtype, out) if initial is not None: result = max(result.to_numpy()[0, 0], initial) @@ -409,7 +415,7 @@ def max( where=where, ) if keepdims: - result._ndim = self._ndim + return array(numpy.array([[result]])) return result result = self._query_compiler.max(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -419,9 +425,9 @@ def max( else: result = result.to_numpy()[0, 0] return result - intermediate = fix_dtypes_and_determine_return( - result.transpose(), new_ndim, dtype, out - ) + if not keepdims and axis != 1: + result = result.transpose() + intermediate = fix_dtypes_and_determine_return(result, new_ndim, dtype, out) if initial is not None: intermediate._query_compiler = (intermediate > initial).where( intermediate, initial @@ -430,16 +436,22 @@ def max( return intermediate def min( - self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): check_kwargs(keepdims=keepdims, where=where) + if initial is None and where is not True: + raise ValueError( + "reduction operation 'minimum' does not have an identity, so to use a where mask one has to specify 'initial'" + ) if self._ndim == 1: if axis == 1: raise numpy.AxisError(1, 1) result = self._query_compiler.min(axis=0) if keepdims: if initial is not None and result.lt(initial): - result = pd.Series([initial])._query_compiler + result = array( + _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 + ) return fix_dtypes_and_determine_return(result, 1, dtype, out) if initial is not None: result = min(result.to_numpy()[0, 0], initial) @@ -456,6 +468,7 @@ def min( where=where, ) if keepdims: + result = array(numpy.array([[result]])) result._ndim = self._ndim return result result = self._query_compiler.min(axis=axis) @@ -465,9 +478,14 @@ def min( result = min(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] - return result + return result if where else initial + if not keepdims and axis != 1: + result = result.transpose() intermediate = fix_dtypes_and_determine_return( - result.transpose(), new_ndim, dtype, out + result, + new_ndim, + dtype, + out, ) if initial is not None: intermediate._query_compiler = (intermediate < initial).where( @@ -628,7 +646,7 @@ def __ne__(self, x2): result = caller._query_compiler.ne(callee._query_compiler, **kwargs) return array(_query_compiler=result, _ndim=new_ndim) - def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=None): + def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: @@ -642,13 +660,15 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=None): axis=axis, dtype=dtype, out=out, keepdims=None, where=where ) if keepdims: - result._ndim = self._ndim + result = array(numpy.array([[result]])) return result result = self._query_compiler.mean(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: return result.to_numpy()[0, 0] - return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) + if not keepdims and axis != 1: + result = result.transpose() + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def __add__( self, @@ -793,7 +813,7 @@ def power( __pow__ = power def prod( - self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: @@ -815,7 +835,7 @@ def prod( where=where, ) if keepdims: - result._ndim = self._ndim + result = array(numpy.array([[result]])) return result result = self._query_compiler.prod(axis=axis) if initial is not None: @@ -823,7 +843,9 @@ def prod( new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: return result.to_numpy()[0, 0] - return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) + if not keepdims and axis != 1: + result = result.transpose() + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def multiply( self, @@ -946,7 +968,7 @@ def __rsub__( return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def sum( - self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None + self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: @@ -968,6 +990,7 @@ def sum( where=where, ) if keepdims: + result = array(numpy.array([[result]])) result._ndim = self._ndim return result result = self._query_compiler.sum(axis=axis) @@ -976,7 +999,9 @@ def sum( new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: return result.to_numpy()[0, 0] - return fix_dtypes_and_determine_return(result.transpose(), new_ndim, dtype, out) + if not keepdims and axis != 1: + result = result.transpose() + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) def flatten(self, order="C"): check_kwargs(order=order) diff --git a/modin/numpy/math.py b/modin/numpy/math.py index f38162bef34..a2c134ff104 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -101,7 +101,7 @@ def power( ) -def prod(a, axis=None, out=None, keepdims=None, where=None): +def prod(a, axis=None, out=None, keepdims=None, where=True): if hasattr(a, "prod"): return a.prod(axis=axis, out=out, keepdims=keepdims, where=where) @@ -154,7 +154,7 @@ def subtract( ) -def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): +def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True): if hasattr(arr, "sum"): return arr.sum(axis) @@ -174,7 +174,7 @@ def true_divide( ) -def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=None): +def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=True): if hasattr(x1, "mean"): return x1.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) @@ -187,7 +187,7 @@ def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=None): minimum = numpy.minimum -def amax(x1, axis=None, out=None, keepdims=None, initial=None, where=None): +def amax(x1, axis=None, out=None, keepdims=None, initial=None, where=True): if hasattr(x1, "max"): return x1.max( axis=axis, out=out, keepdims=keepdims, initial=initial, where=where @@ -197,7 +197,7 @@ def amax(x1, axis=None, out=None, keepdims=None, initial=None, where=None): max = amax -def amin(x1, axis=None, out=None, keepdims=None, initial=None, where=None): +def amin(x1, axis=None, out=None, keepdims=None, initial=None, where=True): if hasattr(x1, "min"): return x1.min( axis=axis, out=out, keepdims=keepdims, initial=initial, where=where From 7404cb3ead26f2a90b90cf5ff1f23bb0112fa6ca Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 00:17:52 -0800 Subject: [PATCH 20/42] Fix out and add Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 2 +- modin/numpy/math.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 9f3e362cdb6..d1e54435363 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -100,7 +100,7 @@ def fix_dtypes_and_determine_return(query_compiler_in, _ndim, dtype=None, out=No out = try_convert_from_interoperable_type(out) broadcast_method = check_how_broadcast_to_output(result, out) result._query_compiler = result._query_compiler.astype( - {col_name: out.dtype for col_name in result._query_compiler} + {col_name: out.dtype for col_name in result._query_compiler.columns} ) if broadcast_method == "broadcastable": out._query_compiler = result._query_compiler diff --git a/modin/numpy/math.py b/modin/numpy/math.py index a2c134ff104..9d50e45ac53 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -29,8 +29,8 @@ def absolute( def add( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "add"): - return x1.add( + if hasattr(x1, "__add__"): + return x1.__add__( x2, out=out, where=where, From 0d3be9331e610d7a9e45a2bb80e743267fa7e970 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 00:28:47 -0800 Subject: [PATCH 21/42] Add support for where kwarg Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 122 ++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index d1e54435363..c756e6e8ccb 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -14,7 +14,7 @@ from math import prod import numpy -from pandas.core.dtypes.common import is_list_like, is_numeric_dtype +from pandas.core.dtypes.common import is_list_like, is_numeric_dtype, is_bool_dtype from pandas.api.types import is_scalar from inspect import signature @@ -58,10 +58,15 @@ def check_kwargs(order="C", subok=True, keepdims=None, casting="same_kind", wher ErrorMessage.single_warning( "Modin does not yet support the `casting` argument." ) - if not where and where is not None: - # TODO(RehanSD): Remove this once indexing is merged. - raise NotImplementedError( - "Modin currently does not support the `where` parameter." + if not ( + is_scalar(where) or (isinstance(where, array) and is_bool_dtype(where.dtype)) + ): + if not isinstance(where, array): + raise NotImplementedError( + f"Modin only supports scalar or modin.numpy.array `where` parameter, not `where` parameter of type {type(where)}" + ) + raise TypeError( + f"Cannot cast array data from {where.dtype} to dtype('bool') according to the rule 'safe'" ) @@ -90,7 +95,9 @@ def check_how_broadcast_to_output(arr_in: "array", arr_out: "array"): ) -def fix_dtypes_and_determine_return(query_compiler_in, _ndim, dtype=None, out=None): +def fix_dtypes_and_determine_return( + query_compiler_in, _ndim, dtype=None, out=None, where=True +): if dtype is not None: query_compiler_in = query_compiler_in.astype( {col_name: dtype for col_name in query_compiler_in.columns} @@ -102,14 +109,26 @@ def fix_dtypes_and_determine_return(query_compiler_in, _ndim, dtype=None, out=No result._query_compiler = result._query_compiler.astype( {col_name: out.dtype for col_name in result._query_compiler.columns} ) - if broadcast_method == "broadcastable": - out._query_compiler = result._query_compiler - elif broadcast_method == "flatten": - out._query_compiler = result.flatten()._query_compiler - else: + if broadcast_method == "flatten": + result = result.flatten() + elif broadcast_method != "broadcastable": # TODO(RehanSD): Replace this when reshape is implemented. raise NotImplementedError("Reshape is currently not supported in Modin.") + if isinstance(where, array): + out._query_compiler = where.where(result, out)._query_compiler + elif where: + out._query_compiler = result._query_compiler return out + if isinstance(where, array) and out is None: + from array_creation import zeros_like + + out = zeros_like(result) + out._query_compiler = where.where(result, out)._query_compiler + return out + elif not where: + from .array_creation import zeros_like + + return zeros_like(result) return result @@ -399,12 +418,12 @@ def max( result = array( _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 ) - return fix_dtypes_and_determine_return(result, 1, dtype, out) + return fix_dtypes_and_determine_return(result, 1, dtype, out, where) if initial is not None: result = max(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] - return result + return result if where else initial if axis is None: result = self.flatten().max( axis=axis, @@ -424,7 +443,7 @@ def max( result = max(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] - return result + return result if where else initial if not keepdims and axis != 1: result = result.transpose() intermediate = fix_dtypes_and_determine_return(result, new_ndim, dtype, out) @@ -452,12 +471,12 @@ def min( result = array( _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 ) - return fix_dtypes_and_determine_return(result, 1, dtype, out) + return fix_dtypes_and_determine_return(result, 1, dtype, out, where) if initial is not None: result = min(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] - return result + return result if where else initial if axis is None: result = self.flatten().min( axis=axis, @@ -482,10 +501,7 @@ def min( if not keepdims and axis != 1: result = result.transpose() intermediate = fix_dtypes_and_determine_return( - result, - new_ndim, - dtype, - out, + result, new_ndim, dtype, out, where ) if initial is not None: intermediate._query_compiler = (intermediate < initial).where( @@ -653,8 +669,8 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): raise numpy.AxisError(1, 1) result = self._query_compiler.mean(axis=0) if keepdims: - return fix_dtypes_and_determine_return(result, 1, dtype, out) - return result.to_numpy()[0, 0] + return fix_dtypes_and_determine_return(result, 1, dtype, out, where) + return result.to_numpy()[0, 0] if where else numpy.nan if axis is None: result = self.flatten().mean( axis=axis, dtype=dtype, out=out, keepdims=None, where=where @@ -665,10 +681,10 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): result = self._query_compiler.mean(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: - return result.to_numpy()[0, 0] + return result.to_numpy()[0, 0] if where else numpy.nan if not keepdims and axis != 1: result = result.transpose() - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def __add__( self, @@ -683,10 +699,12 @@ def __add__( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): result = self._query_compiler.add(x2) - return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) + return fix_dtypes_and_determine_return( + result, self._ndim, dtype, out, where + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.add(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def __radd__( self, @@ -713,7 +731,7 @@ def divide( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.truediv(x2), self._ndim, dtype, out + self._query_compiler.truediv(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -723,7 +741,7 @@ def divide( result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) else: result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __truediv__ = divide @@ -740,14 +758,14 @@ def __rtruediv__( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.rtruediv(x2), self._ndim, dtype, out + self._query_compiler.rtruediv(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) else: result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def floor_divide( self, @@ -766,7 +784,9 @@ def floor_divide( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) - return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) + return fix_dtypes_and_determine_return( + result, self._ndim, dtype, out, where + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is @@ -780,7 +800,7 @@ def floor_divide( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __floordiv__ = floor_divide @@ -797,7 +817,7 @@ def power( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.pow(x2), self._ndim, dtype, out + self._query_compiler.pow(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -808,7 +828,7 @@ def power( "Using power with broadcast is not currently available in Modin." ) result = caller._query_compiler.pow(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __pow__ = power @@ -823,8 +843,8 @@ def prod( if initial is not None: result = result.mul(initial) if keepdims: - return fix_dtypes_and_determine_return(result, 1, dtype, out) - return result.to_numpy()[0, 0] + return fix_dtypes_and_determine_return(result, 1, dtype, out, where) + return result.to_numpy()[0, 0] if where else 1 if axis is None: result = self.flatten().prod( axis=axis, @@ -842,10 +862,10 @@ def prod( result = result.mul(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: - return result.to_numpy()[0, 0] + return result.to_numpy()[0, 0] if where else 1 if not keepdims and axis != 1: result = result.transpose() - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def multiply( self, @@ -860,11 +880,11 @@ def multiply( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.mul(x2), self._ndim, dtype, out + self._query_compiler.mul(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) result = caller._query_compiler.mul(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __mul__ = multiply @@ -897,7 +917,9 @@ def remainder( # NumPy's remainder by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) - return fix_dtypes_and_determine_return(result, self._ndim, dtype, out) + return fix_dtypes_and_determine_return( + result, self._ndim, dtype, out, where + ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is @@ -911,7 +933,7 @@ def remainder( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __mod__ = remainder @@ -928,7 +950,7 @@ def subtract( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.sub(x2), self._ndim, dtype, out + self._query_compiler.sub(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -938,7 +960,7 @@ def subtract( result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) else: result = caller._query_compiler.sub(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __sub__ = subtract @@ -955,7 +977,7 @@ def __rsub__( check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.rsub(x2), self._ndim, dtype, out + self._query_compiler.rsub(x2), self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) if caller._query_compiler != self._query_compiler: @@ -965,7 +987,7 @@ def __rsub__( result = caller._query_compiler.sub(callee._query_compiler, **kwargs) else: result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True @@ -978,8 +1000,8 @@ def sum( if initial is not None: result = result.add(initial) if keepdims: - return fix_dtypes_and_determine_return(result, 1, dtype, out) - return result.to_numpy()[0, 0] + return fix_dtypes_and_determine_return(result, 1, dtype, out, where) + return result.to_numpy()[0, 0] if where else 0 if axis is None: result = self.flatten().sum( axis=axis, @@ -998,10 +1020,10 @@ def sum( result = result.add(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: - return result.to_numpy()[0, 0] + return result.to_numpy()[0, 0] if where else 0 if not keepdims and axis != 1: result = result.transpose() - return fix_dtypes_and_determine_return(result, new_ndim, dtype, out) + return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def flatten(self, order="C"): check_kwargs(order=order) From 508ecb3f9a8b5efca402352c8c6e98b000e112cf Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 00:30:32 -0800 Subject: [PATCH 22/42] Fix lint Signed-off-by: Rehan Durrani --- modin/numpy/array_shaping.py | 2 +- modin/numpy/test/test_array.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modin/numpy/array_shaping.py b/modin/numpy/array_shaping.py index e59cc609666..8b01f0f5e8a 100644 --- a/modin/numpy/array_shaping.py +++ b/modin/numpy/array_shaping.py @@ -38,7 +38,7 @@ def shape(a): def transpose(a, axes=None): if axes is not None: raise NotImplementedError( - f"Modin does not support arrays higher than 2-dimensions. Please use `transpose` with `axis=None` on a 2-dimensional or lower object." + "Modin does not support arrays higher than 2-dimensions. Please use `transpose` with `axis=None` on a 2-dimensional or lower object." ) if hasattr(a, "transpose"): return a.transpose() diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index e3b963a2290..944ca95b6ef 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -90,36 +90,36 @@ def test_scalar_arithmetic(size): modin_arr = np.array(numpy_arr) scalar = numpy.random.randint(1, 100) numpy.testing.assert_array_equal( - (scalar * modin_arr)._to_numpy(), scalar * numpy_arr, err_msg=f"__mul__ failed." + (scalar * modin_arr)._to_numpy(), scalar * numpy_arr, err_msg="__mul__ failed." ) numpy.testing.assert_array_equal( (modin_arr * scalar)._to_numpy(), scalar * numpy_arr, - err_msg=f"__rmul__ failed.", + err_msg="__rmul__ failed.", ) numpy.testing.assert_array_equal( (scalar / modin_arr)._to_numpy(), scalar / numpy_arr, - err_msg=f"__rtruediv__ failed.", + err_msg="__rtruediv__ failed.", ) numpy.testing.assert_array_equal( (modin_arr / scalar)._to_numpy(), numpy_arr / scalar, - err_msg=f"__truediv__ failed.", + err_msg="__truediv__ failed.", ) numpy.testing.assert_array_equal( (scalar + modin_arr)._to_numpy(), scalar + numpy_arr, - err_msg=f"__radd__ failed.", + err_msg="__radd__ failed.", ) numpy.testing.assert_array_equal( - (modin_arr + scalar)._to_numpy(), scalar + numpy_arr, err_msg=f"__add__ failed." + (modin_arr + scalar)._to_numpy(), scalar + numpy_arr, err_msg="__add__ failed." ) numpy.testing.assert_array_equal( (scalar - modin_arr)._to_numpy(), scalar - numpy_arr, - err_msg=f"__rsub__ failed.", + err_msg="__rsub__ failed.", ) numpy.testing.assert_array_equal( - (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg=f"__sub__ failed." + (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." ) From 90aaed7b754ab188d7d126c8ebb27419bef531db Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 12:09:05 -0800 Subject: [PATCH 23/42] Get tests to run Signed-off-by: Rehan Durrani --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0749a141cd8..a6e74d64ec0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -623,6 +623,7 @@ jobs: - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_groupby.py - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_reshape.py - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_general.py + - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array.py - run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh - run: ./.github/workflows/sql_server/set_up_sql_server.sh - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_io.py --verbose @@ -710,6 +711,7 @@ jobs: - run: python -m pytest -n 2 modin/pandas/test/test_series.py - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - run: python -m pytest -n 2 modin/pandas/test/test_concat.py + - run: python -m pytest -n 2 modin/numpy/test/test_array.py if: matrix.engine == 'python' - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 if: matrix.engine != 'python' @@ -842,6 +844,7 @@ jobs: - modin/pandas/test/test_reshape.py - modin/pandas/test/test_general.py - modin/pandas/test/test_io.py + - modin/numpy/test/test_array.py env: MODIN_ENGINE: ${{matrix.engine}} name: test-windows From 88aa6b5cea51bb9513cdb6ae69acb384dee4a8b6 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 13:34:50 -0800 Subject: [PATCH 24/42] Add testing for array ufunc Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 18 ++++++++++-------- modin/numpy/test/test_array.py | 23 ++++++++++++++++++++++- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index c756e6e8ccb..997a8881f4e 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -236,10 +236,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): [ param for param in signature(function).parameters.values() - if param.kind == param.POSITIONAL_ONLY + if param.default == param.empty ] ) - if len_expected_arguments == len(args): + if len_expected_arguments == (len(args) - 1) and method == "__call__": return function(*tuple(args[1:]), **kwargs) else: ErrorMessage.single_warning( @@ -254,7 +254,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if isinstance(input, pd.Series): input = input._query_compiler.to_numpy().flatten() args += [input] - output = args[0].__array_ufunc__(ufunc, method, *args, **kwargs) + output = self._to_numpy().__array_ufunc__(ufunc, method, *args, **kwargs) if is_scalar(output): return output return array(output) @@ -270,13 +270,15 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): [len(inp.shape) for inp in inputs if hasattr(inp, "shape")] ) elif method == "reduce": - new_ufunc = Reduce.register(ufunc, axis=kwargs.get("axis", None)) + if len(inputs) == 1: + new_ufunc = Reduce.register(ufunc, axis=kwargs.get("axis", None)) if kwargs.get("axis", None) is None: out_ndim = 0 else: out_ndim = len(inputs[0].shape) - 1 elif method == "accumulate": - new_ufunc = Reduce.register(ufunc, axis=None) + if len(inputs) == 1: + new_ufunc = Reduce.register(ufunc, axis=None) out_ndim = 0 if new_ufunc is None: ErrorMessage.single_warning( @@ -291,7 +293,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if isinstance(input, pd.Series): input = input._query_compiler.to_numpy().flatten() args += [input] - output = ufunc(*args, **kwargs) + output = self._to_numpy().__array_ufunc__(ufunc, method, *args, **kwargs) if is_scalar(output): return output return array(output) @@ -796,7 +798,7 @@ def floor_divide( "Using floor_divide with broadcast is not currently available in Modin." ) result = caller._query_compiler.floordiv(callee._query_compiler, **kwargs) - if any(callee._query_compiler.eq(0).any()): + if callee._query_compiler.eq(0).any(): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) @@ -929,7 +931,7 @@ def remainder( "Using remainder with broadcast is not currently available in Modin." ) result = caller._query_compiler.mod(callee._query_compiler, **kwargs) - if any(callee._query_compiler.eq(0).any()): + if callee._query_compiler.eq(0).any(): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 944ca95b6ef..2fbe2f5fbd0 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -13,6 +13,7 @@ import numpy import pytest +import warnings import modin.numpy as np @@ -68,7 +69,7 @@ def test_basic_arithmetic_with_broadcast(operand1shape, operand2shape, operator) @pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) -def test_complex_arithmetic(operator): +def test_arithmetic(operator): """Test of operators that do not yet support broadcasting""" for size, textdim in ((100, "1D"), ((10, 10), "2D")): operand1 = numpy.random.randint(-100, 100, size=size) @@ -123,3 +124,23 @@ def test_scalar_arithmetic(size): numpy.testing.assert_array_equal( (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." ) + +@pytest.mark.filterwarnings("ignore:Distributing Date: Sun, 5 Feb 2023 13:42:21 -0800 Subject: [PATCH 25/42] Add testing for array function Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 14 +++++++------- modin/numpy/test/test_array.py | 22 +++++++++++++++++++++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 997a8881f4e..df801e17da6 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -309,14 +309,14 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): from . import array_creation as creation, array_shaping as shaping, math - + func_name = func.__name__ modin_func = None - if hasattr(math, func): - modin_func = getattr(math, func) - elif hasattr(shaping, func): - modin_func = getattr(shaping, func) - elif hasattr(creation, func): - modin_func = getattr(creation, func) + if hasattr(math, func_name): + modin_func = getattr(math, func_name) + elif hasattr(shaping, func_name): + modin_func = getattr(shaping, func_name) + elif hasattr(creation, func_name): + modin_func = getattr(creation, func_name) if modin_func is None: return NotImplemented return modin_func(*args, **kwargs) diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 2fbe2f5fbd0..d13bf210052 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -125,7 +125,6 @@ def test_scalar_arithmetic(size): (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." ) -@pytest.mark.filterwarnings("ignore:Distributing Date: Sun, 5 Feb 2023 13:54:29 -0800 Subject: [PATCH 26/42] Add testing for where Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 2 +- modin/numpy/test/test_array.py | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index df801e17da6..87d7508b794 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -342,7 +342,7 @@ def where(self, x=None, y=None): ErrorMessage.single_warning( "np.where not supported when both x and y are scalars. Defaulting to NumPy." ) - return array(numpy.where(self._query_compiler.to_pandas(), x, y)) + return array(numpy.where(self._to_numpy(), x, y)) if is_scalar(x) and not is_scalar(y): if self._ndim < y._ndim: if not self.shape[0] == y.shape[1]: diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index d13bf210052..e938ad4c4ba 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -161,6 +161,34 @@ def test_array_function(size): numpy_result = numpy.sum(numpy_arr) assert numpy_result == modin_result -# def test_array_where(): -# numpy_flat_arr = numpy.random.randint(-100, 100, size=100) -# modin_flat_arr = np.array(numpy_flat_arr) +def test_array_where(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + with pytest.warns(UserWarning, match="np.where method with only condition specified"): + warnings.filterwarnings("ignore", message="Distributing") + modin_flat_arr.where() + with pytest.raises(ValueError, match="np.where requires x and y"): + modin_flat_arr.where(x=["Should Fail."]) + with pytest.warns(UserWarning, match="np.where not supported when both x and y"): + warnings.filterwarnings("ignore", message="Distributing") + modin_result = modin_flat_arr.where(x=4, y=5) + numpy_result = numpy.where(numpy_flat_arr, 4, 5) + numpy.testing.assert_array_equal(numpy_result, modin_result._to_numpy()) + modin_flat_bool_arr = modin_flat_arr <= 0 + numpy_flat_bool_arr = numpy_flat_arr <= 0 + modin_result = modin_flat_bool_arr.where(x=5, y=modin_flat_arr) + numpy_result = numpy.where(numpy_flat_bool_arr, 5, numpy_flat_arr) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_flat_bool_arr.where(x=modin_flat_arr, y=5) + numpy_result = numpy.where(numpy_flat_bool_arr, numpy_flat_arr, 5) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_flat_bool_arr.where(x=modin_flat_arr, y=(-1 * modin_flat_arr)) + numpy_result = numpy.where(numpy_flat_bool_arr, numpy_flat_arr, (-1 * numpy_flat_arr)) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + modin_bool_arr = modin_arr > 0 + numpy_bool_arr = numpy_arr > 0 + modin_result = modin_bool_arr.where(modin_arr, 10*modin_arr) + numpy_result = numpy.where(numpy_bool_arr, numpy_arr, 10*numpy_arr) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) From f176ac81afd89a300b25bfff07429aea58ce6c0a Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 14:24:21 -0800 Subject: [PATCH 27/42] Add tests for everything but prod, mean, min, max, and sum Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 15 ++- modin/numpy/test/test_array.py | 164 ++++++++++++++++++++++++++++++++- 2 files changed, 171 insertions(+), 8 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 87d7508b794..a4366fc735e 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -254,7 +254,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if isinstance(input, pd.Series): input = input._query_compiler.to_numpy().flatten() args += [input] - output = self._to_numpy().__array_ufunc__(ufunc, method, *args, **kwargs) + output = self._to_numpy().__array_ufunc__( + ufunc, method, *args, **kwargs + ) if is_scalar(output): return output return array(output) @@ -309,6 +311,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): from . import array_creation as creation, array_shaping as shaping, math + func_name = func.__name__ modin_func = None if hasattr(math, func_name): @@ -785,7 +788,11 @@ def floor_divide( if x2 == 0: # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. - result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) + result = ( + result.replace(numpy.inf, 0) + .replace(numpy.NINF, 0) + .replace(numpy.nan, 0) + ) return fix_dtypes_and_determine_return( result, self._ndim, dtype, out, where ) @@ -802,6 +809,7 @@ def floor_divide( # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) + result = result.replace(numpy.nan, 0) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __floordiv__ = floor_divide @@ -1099,8 +1107,7 @@ def astype(self, dtype, order="K", casting="unsafe", subok=True, copy=True): result = self._query_compiler.astype( {col_name: dtype for col_name in self._query_compiler.columns} ) - if copy: - self._query_compiler = result + if not copy and subok and numpy.issubdtype(self.dtype, dtype): return self return array(_query_compiler=result, _ndim=self._ndim) diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index e938ad4c4ba..4ad699d887e 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -31,6 +31,15 @@ def test_shape(size): assert modin_arr.shape == numpy_arr.shape +def test_dtype(): + numpy_arr = numpy.array([[1, "2"], [3, "4"]]) + modin_arr = np.array([[1, "2"], [3, "4"]]) + assert modin_arr.dtype == numpy_arr.dtype + modin_arr = modin_arr == modin_arr.T + numpy_arr = numpy_arr == numpy_arr.T + assert modin_arr.dtype == numpy_arr.dtype + + @pytest.mark.parametrize("operand1shape", [100, (3, 100)]) @pytest.mark.parametrize("operand2shape", [100, (3, 100)]) @pytest.mark.parametrize( @@ -44,6 +53,12 @@ def test_shape(size): "__rmul__", "__radd__", "__rsub__", + "__ge__", + "__gt__", + "__lt__", + "__le__", + "__eq__", + "__ne__", ], ) def test_basic_arithmetic_with_broadcast(operand1shape, operand2shape, operator): @@ -125,6 +140,7 @@ def test_scalar_arithmetic(size): (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." ) + @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_array_ufunc(size): # Test ufunc.__call__ @@ -144,6 +160,7 @@ def test_array_ufunc(size): # We do not test ufunc.reduce and ufunc.accumulate, since these require a binary reduce # operation that Modin does not currently support. + @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_array_function(size): numpy_arr = numpy.random.randint(-100, 100, size=size) @@ -161,10 +178,13 @@ def test_array_function(size): numpy_result = numpy.sum(numpy_arr) assert numpy_result == modin_result + def test_array_where(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) - with pytest.warns(UserWarning, match="np.where method with only condition specified"): + with pytest.warns( + UserWarning, match="np.where method with only condition specified" + ): warnings.filterwarnings("ignore", message="Distributing") modin_flat_arr.where() with pytest.raises(ValueError, match="np.where requires x and y"): @@ -183,12 +203,148 @@ def test_array_where(): numpy_result = numpy.where(numpy_flat_bool_arr, numpy_flat_arr, 5) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) modin_result = modin_flat_bool_arr.where(x=modin_flat_arr, y=(-1 * modin_flat_arr)) - numpy_result = numpy.where(numpy_flat_bool_arr, numpy_flat_arr, (-1 * numpy_flat_arr)) + numpy_result = numpy.where( + numpy_flat_bool_arr, numpy_flat_arr, (-1 * numpy_flat_arr) + ) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) modin_bool_arr = modin_arr > 0 numpy_bool_arr = numpy_arr > 0 - modin_result = modin_bool_arr.where(modin_arr, 10*modin_arr) - numpy_result = numpy.where(numpy_bool_arr, numpy_arr, 10*numpy_arr) + modin_result = modin_bool_arr.where(modin_arr, 10 * modin_arr) + numpy_result = numpy.where(numpy_bool_arr, numpy_arr, 10 * numpy_arr) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + + +def test_max(): + pass + + +def test_min(): + pass + + +def test_mean(): + pass + + +def test_prod(): + pass + + +def test_abs(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal( + numpy.abs(numpy_flat_arr), np.abs(modin_flat_arr)._to_numpy() + ) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal( + numpy.abs(numpy_arr), np.abs(modin_arr)._to_numpy() + ) + + +def test_invert(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) < 0 + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) + + +def test_flatten(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal( + numpy_flat_arr.flatten(), modin_flat_arr.flatten()._to_numpy() + ) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal( + numpy_arr.flatten(), modin_arr.flatten()._to_numpy() + ) + + +def test_transpose(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal( + numpy_flat_arr.transpose(), modin_flat_arr.transpose()._to_numpy() + ) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal( + numpy_arr.transpose(), modin_arr.transpose()._to_numpy() + ) + numpy.testing.assert_array_equal(numpy_arr.T, modin_arr.T._to_numpy()) + + +def test_astype(): + numpy_arr = numpy.array([[1, 2], [3, 4]]) + modin_arr = np.array([[1, 2], [3, 4]]) + modin_result = modin_arr.astype(numpy.float64) + numpy_result = numpy_arr.astype(numpy.float64) + assert modin_result.dtype == numpy_result.dtype + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.astype(str) + numpy_result = numpy_arr.astype(str) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_arr._to_numpy(), numpy_arr) + modin_result = modin_arr.astype(str, copy=False) + numpy_result = numpy_arr.astype(str, copy=False) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_arr._to_numpy(), numpy_arr) + modin_result = modin_arr.astype(numpy.float64, copy=False) + numpy_result = numpy_arr.astype(numpy.float64, copy=False) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_arr._to_numpy(), numpy_arr) + + +def test_zeros_like(): + modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr, dtype=numpy.int8), + np.zeros_like(modin_arr, dtype=numpy.int8)._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr, shape=(10, 10)), + np.zeros_like(modin_arr, shape=(10, 10))._to_numpy(), + ) + modin_arr = np.array([[1, 2], [3, 4]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() + ) + + +def test_ones_like(): + modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr, dtype=numpy.int8), + np.ones_like(modin_arr, dtype=numpy.int8)._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr, shape=(10, 10)), + np.ones_like(modin_arr, shape=(10, 10))._to_numpy(), + ) + modin_arr = np.array([[1, 2], [3, 4]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() + ) From c0a1ecc111155b64840cd03d78d63ee28bcdc2e7 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 17:19:37 -0800 Subject: [PATCH 28/42] Add tests Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 158 ++++++++++--- modin/numpy/test/test_array.py | 390 ++++++++++++++++++++++++++++++++- 2 files changed, 519 insertions(+), 29 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index a4366fc735e..acc1a07f57b 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -420,8 +420,17 @@ def max( result = self._query_compiler.max(axis=0) if keepdims: if initial is not None and result.lt(initial): - result = array( - _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 + result = pd.Series([initial])._query_compiler + if initial is not None: + if out is not None: + out._query_compiler = ( + numpy.ones_like(out) * initial + )._query_compiler + else: + out = array([initial]) + if out is not None and out.shape != (1,): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) return fix_dtypes_and_determine_return(result, 1, dtype, out, where) if initial is not None: @@ -439,7 +448,13 @@ def max( where=where, ) if keepdims: - return array(numpy.array([[result]])) + if out is not None and out.shape != (1, 1): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) + return fix_dtypes_and_determine_return( + array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + ) return result result = self._query_compiler.max(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -451,13 +466,22 @@ def max( return result if where else initial if not keepdims and axis != 1: result = result.transpose() - intermediate = fix_dtypes_and_determine_return(result, new_ndim, dtype, out) if initial is not None: - intermediate._query_compiler = (intermediate > initial).where( - intermediate, initial + if out is not None: + out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + else: + out = ( + numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) + * initial + ) + intermediate = fix_dtypes_and_determine_return( + result, new_ndim, dtype, out, where + ) + if initial is not None: + intermediate._query_compiler = ( + (intermediate > initial).where(intermediate, initial)._query_compiler ) - else: - return intermediate + return intermediate def min( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True @@ -473,8 +497,17 @@ def min( result = self._query_compiler.min(axis=0) if keepdims: if initial is not None and result.lt(initial): - result = array( - _query_compiler=pd.Series([initial])._query_compiler, _ndim=1 + result = pd.Series([initial])._query_compiler + if initial is not None: + if out is not None: + out._query_compiler = ( + numpy.ones_like(out) * initial + )._query_compiler + else: + out = array([initial]) + if out is not None and out.shape != (1,): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) return fix_dtypes_and_determine_return(result, 1, dtype, out, where) if initial is not None: @@ -492,8 +525,13 @@ def min( where=where, ) if keepdims: - result = array(numpy.array([[result]])) - result._ndim = self._ndim + if out is not None and out.shape != (1, 1): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) + return fix_dtypes_and_determine_return( + array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + ) return result result = self._query_compiler.min(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -505,15 +543,22 @@ def min( return result if where else initial if not keepdims and axis != 1: result = result.transpose() + if initial is not None: + if out is not None: + out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + else: + out = ( + numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) + * initial + ) intermediate = fix_dtypes_and_determine_return( result, new_ndim, dtype, out, where ) if initial is not None: - intermediate._query_compiler = (intermediate < initial).where( - intermediate, initial + intermediate._query_compiler = ( + (intermediate < initial).where(intermediate, initial)._query_compiler ) - else: - return intermediate + return intermediate def __abs__( self, @@ -674,6 +719,10 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): raise numpy.AxisError(1, 1) result = self._query_compiler.mean(axis=0) if keepdims: + if out is not None and out.shape != (1,): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) return fix_dtypes_and_determine_return(result, 1, dtype, out, where) return result.to_numpy()[0, 0] if where else numpy.nan if axis is None: @@ -681,7 +730,13 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): axis=axis, dtype=dtype, out=out, keepdims=None, where=where ) if keepdims: - result = array(numpy.array([[result]])) + if out is not None and out.shape != (1, 1): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) + return fix_dtypes_and_determine_return( + array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + ) return result result = self._query_compiler.mean(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -689,6 +744,8 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): return result.to_numpy()[0, 0] if where else numpy.nan if not keepdims and axis != 1: result = result.transpose() + if out is not None: + out._query_compiler = (out * numpy.nan)._query_compiler return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def __add__( @@ -845,6 +902,7 @@ def power( def prod( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): + initial = 1 if initial is None else initial check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: @@ -853,8 +911,19 @@ def prod( if initial is not None: result = result.mul(initial) if keepdims: + if initial is not None: + if out is not None: + out._query_compiler = ( + numpy.ones_like(out) * initial + )._query_compiler + else: + out = array([initial]) + if out is not None and out.shape != (1,): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) return fix_dtypes_and_determine_return(result, 1, dtype, out, where) - return result.to_numpy()[0, 0] if where else 1 + return result.to_numpy()[0, 0] if where else initial if axis is None: result = self.flatten().prod( axis=axis, @@ -865,16 +934,30 @@ def prod( where=where, ) if keepdims: - result = array(numpy.array([[result]])) + if out is not None and out.shape != (1, 1): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) + return fix_dtypes_and_determine_return( + array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + ) return result result = self._query_compiler.prod(axis=axis) if initial is not None: result = result.mul(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: - return result.to_numpy()[0, 0] if where else 1 + return result.to_numpy()[0, 0] if where else initial if not keepdims and axis != 1: result = result.transpose() + if initial is not None: + if out is not None: + out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + else: + out = ( + numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) + * initial + ) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def multiply( @@ -1002,6 +1085,7 @@ def __rsub__( def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): + initial = 0 if initial is None else initial check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: @@ -1010,29 +1094,53 @@ def sum( if initial is not None: result = result.add(initial) if keepdims: + if initial is not None: + if out is not None: + out._query_compiler = ( + numpy.ones_like(out) * initial + )._query_compiler + else: + out = array([initial]) + if out is not None and out.shape != (1,): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) return fix_dtypes_and_determine_return(result, 1, dtype, out, where) - return result.to_numpy()[0, 0] if where else 0 + return result.to_numpy()[0, 0] if where else initial if axis is None: result = self.flatten().sum( axis=axis, dtype=dtype, out=out, - keepdims=keepdims, + keepdims=None, initial=initial, where=where, ) if keepdims: - result = array(numpy.array([[result]])) - result._ndim = self._ndim + if out is not None and out.shape != (1, 1): + raise ValueError( + f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" + ) + return fix_dtypes_and_determine_return( + array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + ) return result result = self._query_compiler.sum(axis=axis) if initial is not None: result = result.add(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: - return result.to_numpy()[0, 0] if where else 0 + return result.to_numpy()[0, 0] if where else initial if not keepdims and axis != 1: result = result.transpose() + if initial is not None: + if out is not None: + out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + else: + out = ( + numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) + * initial + ) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def flatten(self, order="C"): diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 4ad699d887e..5fd8d1e9087 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -217,19 +217,401 @@ def test_array_where(): def test_max(): - pass + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.max() == numpy_arr.max() + modin_result = modin_arr.max(axis=0) + numpy_result = modin_arr.max(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=200) + numpy_result = numpy_arr.max(initial=200) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=0, where=False) + numpy_result = numpy_arr.max(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.max(keepdims=True) + numpy_result = numpy_arr.max(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.max() == numpy_arr.max() + modin_result = modin_arr.max(axis=0) + numpy_result = numpy_arr.max(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=0, keepdims=True) + numpy_result = numpy_arr.max(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=1) + numpy_result = numpy_arr.max(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=1, keepdims=True) + numpy_result = numpy_arr.max(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(initial=200) + numpy_result = numpy_arr.max(initial=200) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=0, where=False) + numpy_result = numpy_arr.max(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_result = modin_arr.max(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.max(out=modin_out, keepdims=True) + numpy_result = numpy_arr.max(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.max(axis=0, where=False, initial=4) + numpy_result = numpy_arr.max(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.max(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) def test_min(): - pass + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.min() == numpy_arr.min() + modin_result = modin_arr.min(axis=0) + numpy_result = modin_arr.min(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=-200) + numpy_result = numpy_arr.min(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=0, where=False) + numpy_result = numpy_arr.min(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.min(keepdims=True) + numpy_result = numpy_arr.min(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.min() == numpy_arr.min() + modin_result = modin_arr.min(axis=0) + numpy_result = numpy_arr.min(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=0, keepdims=True) + numpy_result = numpy_arr.min(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=1) + numpy_result = numpy_arr.min(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=1, keepdims=True) + numpy_result = numpy_arr.min(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(initial=-200) + numpy_result = numpy_arr.min(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=0, where=False) + numpy_result = numpy_arr.min(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_result = modin_arr.min(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.min(out=modin_out, keepdims=True) + numpy_result = numpy_arr.min(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.min(axis=0, where=False, initial=4) + numpy_result = numpy_arr.min(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.min(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + + +def test_sum(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.sum() == numpy_arr.sum() + modin_result = modin_arr.sum(axis=0) + numpy_result = modin_arr.sum(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=-200) + numpy_result = numpy_arr.sum(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=0, where=False) + numpy_result = numpy_arr.sum(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.sum(keepdims=True) + numpy_result = numpy_arr.sum(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.sum() == numpy_arr.sum() + modin_result = modin_arr.sum(axis=0) + numpy_result = numpy_arr.sum(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=0, keepdims=True) + numpy_result = numpy_arr.sum(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=1) + numpy_result = numpy_arr.sum(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=1, keepdims=True) + numpy_result = numpy_arr.sum(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(initial=-200) + numpy_result = numpy_arr.sum(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=0, where=False) + numpy_result = numpy_arr.sum(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_result = modin_arr.sum(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.sum(out=modin_out, keepdims=True) + numpy_result = numpy_arr.sum(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.sum(axis=0, where=False, initial=4) + numpy_result = numpy_arr.sum(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) def test_mean(): - pass + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.mean() == numpy_arr.mean() + modin_result = modin_arr.mean(axis=0) + numpy_result = modin_arr.mean(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.mean() + numpy_result = numpy_arr.mean() + assert modin_result == numpy_result + modin_result = modin_arr.mean(keepdims=True) + numpy_result = numpy_arr.mean(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.mean() == numpy_arr.mean() + modin_result = modin_arr.mean(axis=0) + numpy_result = numpy_arr.mean(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=0, keepdims=True) + numpy_result = numpy_arr.mean(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=1) + numpy_result = numpy_arr.mean(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=1, keepdims=True) + numpy_result = numpy_arr.mean(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean() + numpy_result = numpy_arr.mean() + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_result = modin_arr.mean(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.mean(out=modin_out, keepdims=True) + numpy_result = numpy_arr.mean(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=0, where=False, out=modin_out) + numpy_result = numpy_arr.mean(axis=0, where=False, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=0, out=modin_out) + numpy_result = numpy_arr.mean(axis=0, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.mean(axis=0, out=modin_out, where=modin_where) + numpy_result = numpy_arr.mean(axis=0, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) def test_prod(): - pass + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.prod() == numpy_arr.prod() + modin_result = modin_arr.prod(axis=0) + numpy_result = modin_arr.prod(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=-200) + numpy_result = numpy_arr.prod(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=0, where=False) + numpy_result = numpy_arr.prod(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.prod(keepdims=True) + numpy_result = numpy_arr.prod(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.prod() == numpy_arr.prod() + modin_result = modin_arr.prod(axis=0) + numpy_result = numpy_arr.prod(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=0, keepdims=True) + numpy_result = numpy_arr.prod(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=1) + numpy_result = numpy_arr.prod(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=1, keepdims=True) + numpy_result = numpy_arr.prod(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(initial=-200) + numpy_result = numpy_arr.prod(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=0, where=False) + numpy_result = numpy_arr.prod(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_result = modin_arr.prod(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.prod(out=modin_out, keepdims=True) + numpy_result = numpy_arr.prod(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.prod(axis=0, where=False, initial=4) + numpy_result = numpy_arr.prod(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-10, 10, size=(20, 20)) + modin_arr = np.array(numpy_arr) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) def test_abs(): From e70679665bc9f0d8f5d23b4dbd836de2da7e11bf Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 18:55:44 -0800 Subject: [PATCH 29/42] Bypass overflow dtype issues Signed-off-by: Rehan Durrani --- modin/numpy/test/test_array.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 5fd8d1e9087..c541de6b7bd 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -287,6 +287,12 @@ def test_max(): numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) numpy_where = numpy.full(20, False) numpy_where[:10] = True numpy.random.shuffle(numpy_where) @@ -368,6 +374,12 @@ def test_min(): numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) numpy_where = numpy.full(20, False) numpy_where[:10] = True numpy.random.shuffle(numpy_where) @@ -449,6 +461,12 @@ def test_sum(): numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) numpy_where = numpy.full(20, False) numpy_where[:10] = True numpy.random.shuffle(numpy_where) @@ -521,6 +539,12 @@ def test_mean(): numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=1, out=modin_out) + numpy_result = numpy_arr.mean(axis=1, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) numpy_where = numpy.full(20, False) numpy_where[:10] = True numpy.random.shuffle(numpy_where) @@ -594,7 +618,7 @@ def test_prod(): numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-10, 10, size=(20, 20)) + numpy_arr = numpy.random.randint(-5, 5, size=(20, 20)) modin_arr = np.array(numpy_arr) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) @@ -604,6 +628,12 @@ def test_prod(): numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) numpy_where = numpy.full(20, False) numpy_where[:10] = True numpy.random.shuffle(numpy_where) From 23fe0c4294a3d5216abd345d4de15c7669bfc56c Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 19:32:38 -0800 Subject: [PATCH 30/42] Cast to output dtype Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 329 ++++++++++++++++++++++++++------- modin/numpy/test/test_array.py | 2 +- 2 files changed, 267 insertions(+), 64 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index acc1a07f57b..0c3abe8cb1f 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -122,7 +122,7 @@ def fix_dtypes_and_determine_return( if isinstance(where, array) and out is None: from array_creation import zeros_like - out = zeros_like(result) + out = zeros_like(result).astype(out_dtype) out._query_compiler = where.where(result, out)._query_compiler return out elif not where: @@ -427,7 +427,7 @@ def max( numpy.ones_like(out) * initial )._query_compiler else: - out = array([initial]) + out = array([initial]).astype(self.dtype) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" @@ -473,7 +473,7 @@ def max( out = ( numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) * initial - ) + ).astype(self.dtype) intermediate = fix_dtypes_and_determine_return( result, new_ndim, dtype, out, where ) @@ -504,7 +504,7 @@ def min( numpy.ones_like(out) * initial )._query_compiler else: - out = array([initial]) + out = array([initial]).astype(self.dtype) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" @@ -550,7 +550,7 @@ def min( out = ( numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) * initial - ) + ).astype(self.dtype) intermediate = fix_dtypes_and_determine_return( result, new_ndim, dtype, out, where ) @@ -569,8 +569,15 @@ def __abs__( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, casting=casting, subok=subok, where=where) - result = self._query_compiler.abs() + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).abs() if dtype is not None: result = result.astype({col_name: dtype for col_name in result.columns}) if out is not None: @@ -713,11 +720,18 @@ def __ne__(self, x2): return array(_query_compiler=result, _ndim=new_ndim) def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: raise numpy.AxisError(1, 1) - result = self._query_compiler.mean(axis=0) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).mean(axis=0) if keepdims: if out is not None and out.shape != (1,): raise ValueError( @@ -726,8 +740,10 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): return fix_dtypes_and_determine_return(result, 1, dtype, out, where) return result.to_numpy()[0, 0] if where else numpy.nan if axis is None: - result = self.flatten().mean( - axis=axis, dtype=dtype, out=out, keepdims=None, where=where + result = ( + self.flatten() + .astype(out_dtype) + .mean(axis=axis, dtype=dtype, out=out, keepdims=None, where=where) ) if keepdims: if out is not None and out.shape != (1, 1): @@ -735,17 +751,23 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=None, *, where=True): f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) return fix_dtypes_and_determine_return( - array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + array(numpy.array([[result]])).astype(out_dtype)._query_compiler, + 2, + dtype, + out, + where, ) return result - result = self._query_compiler.mean(axis=axis) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).mean(axis=axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: return result.to_numpy()[0, 0] if where else numpy.nan if not keepdims and axis != 1: result = result.transpose() if out is not None: - out._query_compiler = (out * numpy.nan)._query_compiler + out._query_compiler = (out * numpy.nan).astype(out_dtype)._query_compiler return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def __add__( @@ -758,14 +780,27 @@ def __add__( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - result = self._query_compiler.add(x2) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).add(x2) return fix_dtypes_and_determine_return( result, self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - result = caller._query_compiler.add(callee._query_compiler, **kwargs) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) + result = caller_qc.add(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def __radd__( @@ -790,19 +825,36 @@ def divide( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.truediv(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).truediv(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object/2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.rtruediv(1D_object). - result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) + result = caller_qc.rtruediv(callee_qc, **kwargs) else: - result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) + result = caller_qc.truediv(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __truediv__ = divide @@ -817,16 +869,33 @@ def __rtruediv__( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.rtruediv(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).rtruediv(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: - result = caller._query_compiler.truediv(callee._query_compiler, **kwargs) + result = caller_qc.truediv(callee_qc, **kwargs) else: - result = caller._query_compiler.rtruediv(callee._query_compiler, **kwargs) + result = caller_qc.rtruediv(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def floor_divide( @@ -839,9 +908,16 @@ def floor_divide( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - result = self._query_compiler.floordiv(x2) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).floordiv(x2) if x2 == 0: # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. @@ -854,6 +930,12 @@ def floor_divide( result, self._ndim, dtype, out, where ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using @@ -861,7 +943,7 @@ def floor_divide( raise NotImplementedError( "Using floor_divide with broadcast is not currently available in Modin." ) - result = caller._query_compiler.floordiv(callee._query_compiler, **kwargs) + result = caller_qc.floordiv(callee_qc, **kwargs) if callee._query_compiler.eq(0).any(): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. @@ -881,12 +963,29 @@ def power( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.pow(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).pow(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: # Modin does not correctly support broadcasting when the caller of the function is # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using @@ -894,7 +993,7 @@ def power( raise NotImplementedError( "Using power with broadcast is not currently available in Modin." ) - result = caller._query_compiler.pow(callee._query_compiler, **kwargs) + result = caller_qc.pow(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __pow__ = power @@ -902,22 +1001,31 @@ def power( def prod( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) initial = 1 if initial is None else initial check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: raise numpy.AxisError(1, 1) - result = self._query_compiler.prod(axis=0) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).prod(axis=0) if initial is not None: result = result.mul(initial) if keepdims: if initial is not None: if out is not None: out._query_compiler = ( - numpy.ones_like(out) * initial - )._query_compiler + (numpy.ones_like(out) * initial) + .astype(out_dtype) + ._query_compiler + ) else: - out = array([initial]) + out = array([initial]).astype(out_dtype) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" @@ -925,13 +1033,17 @@ def prod( return fix_dtypes_and_determine_return(result, 1, dtype, out, where) return result.to_numpy()[0, 0] if where else initial if axis is None: - result = self.flatten().prod( - axis=axis, - dtype=dtype, - out=out, - keepdims=None, - initial=initial, - where=where, + result = ( + self.flatten() + .astype(out_dtype) + .prod( + axis=axis, + dtype=dtype, + out=out, + keepdims=None, + initial=initial, + where=where, + ) ) if keepdims: if out is not None and out.shape != (1, 1): @@ -939,10 +1051,16 @@ def prod( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) return fix_dtypes_and_determine_return( - array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + array(numpy.array([[result]])).astype(out_dtype)._query_compiler, + 2, + dtype, + out, + where, ) return result - result = self._query_compiler.prod(axis=axis) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).prod(axis=axis) if initial is not None: result = result.mul(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -952,12 +1070,14 @@ def prod( result = result.transpose() if initial is not None: if out is not None: - out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + out._query_compiler = ( + (numpy.ones_like(out) * initial).astype(out_dtype)._query_compiler + ) else: out = ( numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) * initial - ) + ).astype(out_dtype) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def multiply( @@ -970,13 +1090,30 @@ def multiply( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.mul(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).mul(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) - result = caller._query_compiler.mul(callee._query_compiler, **kwargs) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) + result = caller_qc.mul(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __mul__ = multiply @@ -1003,9 +1140,16 @@ def remainder( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): - result = self._query_compiler.mod(x2) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).mod(x2) if x2 == 0: # NumPy's remainder by 0 works differently from pandas', so we need to fix # the output. @@ -1021,7 +1165,13 @@ def remainder( raise NotImplementedError( "Using remainder with broadcast is not currently available in Modin." ) - result = caller._query_compiler.mod(callee._query_compiler, **kwargs) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) + result = caller_qc.mod(callee_qc, **kwargs) if callee._query_compiler.eq(0).any(): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. @@ -1040,19 +1190,36 @@ def subtract( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.sub(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).sub(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.rsub(1D_object). - result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) + result = caller_qc.rsub(callee_qc, **kwargs) else: - result = caller._query_compiler.sub(callee._query_compiler, **kwargs) + result = caller_qc.sub(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __sub__ = subtract @@ -1067,40 +1234,64 @@ def __rsub__( dtype=None, subok=True, ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): return fix_dtypes_and_determine_return( - self._query_compiler.rsub(x2), self._ndim, dtype, out, where + self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).rsub(x2), + self._ndim, + dtype, + out, + where, ) caller, callee, new_ndim, kwargs = self._binary_op(x2) + caller_qc = caller._query_compiler.astype( + {col_name: out_dtype for col_name in caller._query_compiler.columns} + ) + callee_qc = callee._query_compiler.astype( + {col_name: out_dtype for col_name in callee._query_compiler.columns} + ) if caller._query_compiler != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.sub(1D_object). - result = caller._query_compiler.sub(callee._query_compiler, **kwargs) + result = caller_qc.sub(callee_qc, **kwargs) else: - result = caller._query_compiler.rsub(callee._query_compiler, **kwargs) + result = caller_qc.rsub(callee_qc, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): + out_dtype = ( + dtype + if dtype is not None + else (out.dtype if out is not None else self.dtype) + ) initial = 0 if initial is None else initial check_kwargs(keepdims=keepdims, where=where) if self._ndim == 1: if axis == 1: raise numpy.AxisError(1, 1) - result = self._query_compiler.sum(axis=0) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).sum(axis=0) if initial is not None: result = result.add(initial) if keepdims: if initial is not None: if out is not None: out._query_compiler = ( - numpy.ones_like(out) * initial + numpy.ones_like(out, dtype=out_dtype) * initial )._query_compiler else: - out = array([initial]) + out = array([initial], dtype=out_dtype) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" @@ -1108,13 +1299,17 @@ def sum( return fix_dtypes_and_determine_return(result, 1, dtype, out, where) return result.to_numpy()[0, 0] if where else initial if axis is None: - result = self.flatten().sum( - axis=axis, - dtype=dtype, - out=out, - keepdims=None, - initial=initial, - where=where, + result = ( + self.flatten() + .astype(out_dtype) + .sum( + axis=axis, + dtype=dtype, + out=out, + keepdims=None, + initial=initial, + where=where, + ) ) if keepdims: if out is not None and out.shape != (1, 1): @@ -1122,10 +1317,16 @@ def sum( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) return fix_dtypes_and_determine_return( - array(numpy.array([[result]]))._query_compiler, 2, dtype, out, where + array(numpy.array([[result]], dtype=out_dtype))._query_compiler, + 2, + dtype, + out, + where, ) return result - result = self._query_compiler.sum(axis=axis) + result = self._query_compiler.astype( + {col_name: out_dtype for col_name in self._query_compiler.columns} + ).sum(axis=axis) if initial is not None: result = result.add(initial) new_ndim = self._ndim - 1 if not keepdims else self._ndim @@ -1135,12 +1336,14 @@ def sum( result = result.transpose() if initial is not None: if out is not None: - out._query_compiler = (numpy.ones_like(out) * initial)._query_compiler + out._query_compiler = ( + (numpy.ones_like(out) * initial).astype(out_dtype)._query_compiler + ) else: out = ( numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) * initial - ) + ).astype(out_dtype) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def flatten(self, order="C"): diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index c541de6b7bd..246cc8b511c 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -618,7 +618,7 @@ def test_prod(): numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-5, 5, size=(20, 20)) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) modin_arr = np.array(numpy_arr) numpy_out = numpy.ones(20) modin_out = np.array(numpy_out) From 22b01e025187fece204df314516fb3e0c31a7cb1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Sun, 5 Feb 2023 20:35:58 -0800 Subject: [PATCH 31/42] Fix lint Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 0c3abe8cb1f..3c897595ff2 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -122,7 +122,7 @@ def fix_dtypes_and_determine_return( if isinstance(where, array) and out is None: from array_creation import zeros_like - out = zeros_like(result).astype(out_dtype) + out = zeros_like(result).astype(dtype if dtype is not None else result.dtype) out._query_compiler = where.where(result, out)._query_compiler return out elif not where: From 52f0928bf63d87d4e9b3b6b716c162a6ca28a597 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 10:40:32 -0800 Subject: [PATCH 32/42] Add defensive dimension check Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 3c897595ff2..ece1364b433 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -195,6 +195,10 @@ def __init__( else: target_kwargs[key] = locals()[key] arr = numpy.array(object, **target_kwargs) + assert arr.ndim in ( + 1, + 2, + ), "Modin.NumPy currently only supports 1D and 2D objects." self._ndim = len(arr.shape) if self._ndim > 2: ErrorMessage.not_implemented( From cfaa0663b8c4a0deeede1102142c5eba79a0c5bf Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 13:39:44 -0800 Subject: [PATCH 33/42] Fix auto-cast issue Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 84 +++++++++++++++++++++++++++------- modin/numpy/test/test_array.py | 16 +++++++ 2 files changed, 84 insertions(+), 16 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index ece1364b433..a8df2b52eac 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -784,10 +784,15 @@ def __add__( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -829,10 +834,15 @@ def divide( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -873,10 +883,15 @@ def __rtruediv__( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -912,23 +927,28 @@ def floor_divide( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): result = self._query_compiler.astype( {col_name: out_dtype for col_name in self._query_compiler.columns} ).floordiv(x2) - if x2 == 0: + if x2 == 0 and numpy.issubdtype(out_dtype, numpy.integer): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = ( result.replace(numpy.inf, 0) .replace(numpy.NINF, 0) - .replace(numpy.nan, 0) + .where(self._query_compiler.ne(0), 0) ) return fix_dtypes_and_determine_return( result, self._ndim, dtype, out, where @@ -948,11 +968,16 @@ def floor_divide( "Using floor_divide with broadcast is not currently available in Modin." ) result = caller_qc.floordiv(callee_qc, **kwargs) - if callee._query_compiler.eq(0).any(): + if callee._query_compiler.eq(0).any() and numpy.issubdtype( + out_dtype, numpy.integer + ): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. - result = result.replace(numpy.inf, 0).replace(numpy.NINF, 0) - result = result.replace(numpy.nan, 0) + result = ( + result.replace(numpy.inf, 0) + .replace(numpy.NINF, 0) + .where(callee_qc.ne(0), 0) + ) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __floordiv__ = floor_divide @@ -967,10 +992,15 @@ def power( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -1094,10 +1124,15 @@ def multiply( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -1144,17 +1179,22 @@ def remainder( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): result = self._query_compiler.astype( {col_name: out_dtype for col_name in self._query_compiler.columns} ).mod(x2) - if x2 == 0: + if x2 == 0 and numpy.issubdtype(out_dtype, numpy.integer): # NumPy's remainder by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) @@ -1176,7 +1216,9 @@ def remainder( {col_name: out_dtype for col_name in callee._query_compiler.columns} ) result = caller_qc.mod(callee_qc, **kwargs) - if callee._query_compiler.eq(0).any(): + if callee._query_compiler.eq(0).any() and numpy.issubdtype( + out_dtype, numpy.integer + ): # NumPy's floor_divide by 0 works differently from pandas', so we need to fix # the output. result = result.replace(numpy.NaN, 0) @@ -1194,10 +1236,15 @@ def subtract( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): @@ -1238,10 +1285,15 @@ def __rsub__( dtype=None, subok=True, ): + operand_dtype = ( + self.dtype + if not isinstance(x2, array) + else find_common_dtype([self.dtype, x2.dtype]) + ) out_dtype = ( dtype if dtype is not None - else (out.dtype if out is not None else self.dtype) + else (out.dtype if out is not None else operand_dtype) ) check_kwargs(order=order, subok=subok, casting=casting, where=where) if is_scalar(x2): diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 246cc8b511c..876d1715abd 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -100,6 +100,22 @@ def test_arithmetic(operator): ) +def test_arithmetic_nans_and_zeros(): + numpy_arr1 = numpy.array([[1, 0, 3], [numpy.nan, 0, numpy.nan]]) + numpy_arr2 = numpy.array([1, 0, 0]) + numpy.testing.assert_array_equal( + numpy_arr1 // numpy_arr2, + (np.array(numpy_arr1) // np.array(numpy_arr2))._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.array([0]) // 0, (np.array([0]) // 0)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.array([0], dtype=numpy.float64) // 0, + (np.array([0], dtype=numpy.float64) // 0)._to_numpy(), + ) + + @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_scalar_arithmetic(size): numpy_arr = numpy.random.randint(-100, 100, size=size) From f9be32d5f408124a6d76e135f75abf850bc854fa Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 15:43:37 -0800 Subject: [PATCH 34/42] Fix CI bug Signed-off-by: Rehan Durrani --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6e74d64ec0..8622778dcec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -711,10 +711,10 @@ jobs: - run: python -m pytest -n 2 modin/pandas/test/test_series.py - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - run: python -m pytest -n 2 modin/pandas/test/test_concat.py - - run: python -m pytest -n 2 modin/numpy/test/test_array.py if: matrix.engine == 'python' - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 if: matrix.engine != 'python' + - run: python -m pytest -n 2 modin/numpy/test/test_array.py - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py - run: python -m pytest -n 2 modin/pandas/test/test_general.py From 48967d8b6aab790608d8f3d711b007f368a47922 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 16:30:47 -0800 Subject: [PATCH 35/42] Address review comments Signed-off-by: Rehan Durrani --- modin/numpy/__init__.py | 4 +- modin/numpy/arr.py | 15 +- modin/numpy/array_creation.py | 10 ++ modin/numpy/array_shaping.py | 36 +++-- modin/numpy/constants.py | 18 +++ modin/numpy/math.py | 246 ++++++++++++++++++++++++++++----- modin/numpy/test/test_array.py | 10 +- 7 files changed, 273 insertions(+), 66 deletions(-) diff --git a/modin/numpy/__init__.py b/modin/numpy/__init__.py index 1dae8dc0c1c..4ef184d171f 100644 --- a/modin/numpy/__init__.py +++ b/modin/numpy/__init__.py @@ -68,9 +68,9 @@ def where(condition, x=None, y=None): - if condition: + if condition is True: return x - if not condition: + if condition is False: return y if hasattr(condition, "where"): return condition.where(x=x, y=y) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index a8df2b52eac..ae933dd41fe 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -79,14 +79,14 @@ def check_how_broadcast_to_output(arr_in: "array", arr_out: "array"): ) elif arr_out._ndim == arr_in._ndim: return "broadcastable" - elif arr_out._ndim == 1: + if arr_out._ndim == 1: if prod(arr_in.shape) == arr_out.shape[0]: return "flatten" else: raise ValueError( f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" ) - elif arr_in._ndim == 1: + if arr_in._ndim == 1: if prod(arr_out.shape) == arr_in.shape[0]: return "reshape" else: @@ -135,12 +135,7 @@ def fix_dtypes_and_determine_return( def find_common_dtype(dtypes): if len(dtypes) == 1: return dtypes[0] - elif len(dtypes) == 2: - return numpy.promote_types(*dtypes) - midpoint = len(dtypes) // 2 - return numpy.promote_types( - find_common_dtype(dtypes[:midpoint]), find_common_dtype(dtypes[midpoint:]) - ) + return numpy.common_type(dtypes, []) class array(object): @@ -329,6 +324,10 @@ def __array_function__(self, func, types, args, kwargs): return modin_func(*args, **kwargs) def where(self, x=None, y=None): + if not is_bool_dtype(self.dtype): + raise NotImplementedError( + "Modin currently only supports where on condition arrays with boolean dtype." + ) if x is None and y is None: ErrorMessage.single_warning( "np.where method with only condition specified is not yet supported in Modin. Defaulting to NumPy." diff --git a/modin/numpy/array_creation.py b/modin/numpy/array_creation.py index 33bfb156bf4..295d4821793 100644 --- a/modin/numpy/array_creation.py +++ b/modin/numpy/array_creation.py @@ -31,12 +31,22 @@ def _create_array(dtype, shape, order, subok, numpy_method): def zeros_like(a, dtype=None, order="K", subok=True, shape=None): + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for zeros_like, not {type(a)}. Defaulting to NumPy." + ) + return numpy.zeros_like(a, dtype=dtype, order=order, subok=subok, shape=shape) dtype = a.dtype if dtype is None else dtype shape = a.shape if shape is None else shape return _create_array(dtype, shape, order, subok, "zeros") def ones_like(a, dtype=None, order="K", subok=True, shape=None): + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for ones_like, not {type(a)}. Defaulting to NumPy." + ) + return numpy.ones_like(a, dtype=dtype, order=order, subok=subok, shape=shape) dtype = a.dtype if dtype is None else dtype shape = a.shape if shape is None else shape return _create_array(dtype, shape, order, subok, "ones") diff --git a/modin/numpy/array_shaping.py b/modin/numpy/array_shaping.py index 8b01f0f5e8a..e9814e7a770 100644 --- a/modin/numpy/array_shaping.py +++ b/modin/numpy/array_shaping.py @@ -12,36 +12,42 @@ # governing permissions and limitations under the License. """Module houses array shaping methods for Modin's NumPy API.""" +import numpy + from modin.error_message import ErrorMessage +from .arr import array def ravel(a, order="C"): + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for ravel, not {type(a)}. Defaulting to NumPy." + ) + return numpy.ravel(a, order=order) if order != "C": ErrorMessage.single_warning( "Array order besides 'C' is not currently supported in Modin. Defaulting to 'C' order." ) - if hasattr(a, "flatten"): - return a.flatten(order) - raise NotImplementedError( - f"Object of type {type(a)} does not have a flatten method to use for raveling." - ) + return a.flatten(order) def shape(a): - if hasattr(a, "shape"): - return a.shape - raise NotImplementedError( - f"Object of type {type(a)} does not have a shape property." - ) + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for shape, not {type(a)}. Defaulting to NumPy." + ) + return numpy.shape(a) + return a.shape def transpose(a, axes=None): + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for transpose, not {type(a)}. Defaulting to NumPy." + ) + return numpy.transpose(a, axes=axes) if axes is not None: raise NotImplementedError( "Modin does not support arrays higher than 2-dimensions. Please use `transpose` with `axis=None` on a 2-dimensional or lower object." ) - if hasattr(a, "transpose"): - return a.transpose() - raise NotImplementedError( - f"Object of type {type(a)} does not have a transpose method." - ) + return a.transpose() diff --git a/modin/numpy/constants.py b/modin/numpy/constants.py index 0d7576516f9..96b91503aa5 100644 --- a/modin/numpy/constants.py +++ b/modin/numpy/constants.py @@ -29,3 +29,21 @@ newaxis, pi, ) + +__all__ = [ + "Inf", + "Infinity", + "NAN", + "NINF", + "NZERO", + "NaN", + "PINF", + "PZERO", + "e", + "euler_gamma", + "inf", + "infty", + "nan", + "newaxis", + "pi", +] diff --git a/modin/numpy/math.py b/modin/numpy/math.py index 9d50e45ac53..b4bdbe9d6ee 100644 --- a/modin/numpy/math.py +++ b/modin/numpy/math.py @@ -13,14 +13,29 @@ import numpy +from .arr import array +from modin.error_message import ErrorMessage + def absolute( x, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x, "absolute"): - return x.absolute( - out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok + if not isinstance(x, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for absolute, not {type(x)}. Defaulting to NumPy." ) + return numpy.absolute( + x, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + return x.absolute( + out=out, where=where, casting=casting, order=order, dtype=dtype, subok=subok + ) abs = absolute @@ -29,8 +44,12 @@ def absolute( def add( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "__add__"): - return x1.__add__( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for add, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.add( + x1, x2, out=out, where=where, @@ -39,13 +58,26 @@ def add( dtype=dtype, subok=subok, ) + return x1.__add__( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "divide"): - return x1.divide( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for divide, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.divide( + x1, x2, out=out, where=where, @@ -54,13 +86,26 @@ def divide( dtype=dtype, subok=subok, ) + return x1.divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def float_power( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "float_power"): - return x1.float_power( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for float_power, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.float_power( + x1, x2, out=out, where=where, @@ -69,13 +114,26 @@ def float_power( dtype=dtype, subok=subok, ) + return x1.float_power( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def floor_divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "floor_divide"): - return x1.floor_divide( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for floor_divide, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.floor_divide( + x1, x2, out=out, where=where, @@ -84,13 +142,26 @@ def floor_divide( dtype=dtype, subok=subok, ) + return x1.floor_divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def power( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "power"): - return x1.power( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for power, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.power( + x1, x2, out=out, where=where, @@ -99,18 +170,45 @@ def power( dtype=dtype, subok=subok, ) - - -def prod(a, axis=None, out=None, keepdims=None, where=True): - if hasattr(a, "prod"): - return a.prod(axis=axis, out=out, keepdims=keepdims, where=where) + return x1.power( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) + + +def prod(a, axis=None, out=None, keepdims=None, where=True, dtype=None, initial=None): + if not isinstance(a, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for prod, not {type(a)}. Defaulting to NumPy." + ) + return numpy.prod( + a, + axis=axis, + out=out, + keepdims=keepdims, + where=where, + dtype=dtype, + initial=initial, + ) + return a.prod( + axis=axis, out=out, keepdims=keepdims, where=where, dtype=dtype, initial=initial + ) def multiply( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "multiply"): - return x1.multiply( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for multiply, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.multiply( + x1, x2, out=out, where=where, @@ -119,13 +217,26 @@ def multiply( dtype=dtype, subok=subok, ) + return x1.multiply( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def remainder( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "remainder"): - return x1.remainder( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for remainder, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.remainder( + x1, x2, out=out, where=where, @@ -134,6 +245,15 @@ def remainder( dtype=dtype, subok=subok, ) + return x1.remainder( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) mod = remainder @@ -142,8 +262,12 @@ def remainder( def subtract( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "subtract"): - return x1.subtract( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for power, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.subtract( + x1, x2, out=out, where=where, @@ -152,18 +276,45 @@ def subtract( dtype=dtype, subok=subok, ) + return x1.subtract( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def sum(arr, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True): - if hasattr(arr, "sum"): - return arr.sum(axis) + if not isinstance(arr, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for sum, not {type(arr)}. Defaulting to NumPy." + ) + return numpy.sum( + arr, + axis=axis, + out=out, + keepdims=keepdims, + where=where, + dtype=dtype, + initial=initial, + ) + return arr.sum( + axis=axis, out=out, keepdims=keepdims, where=where, dtype=dtype, initial=initial + ) def true_divide( x1, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True ): - if hasattr(x1, "divide"): - return x1.divide( + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for true_divide, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.true_divide( + x1, x2, out=out, where=where, @@ -172,11 +323,26 @@ def true_divide( dtype=dtype, subok=subok, ) + return x1.divide( + x2, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=True): - if hasattr(x1, "mean"): - return x1.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for mean, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.mean( + x1, axis=axis, out=out, keepdims=keepdims, where=where, dtype=dtype + ) + return x1.mean(axis=axis, out=out, keepdims=keepdims, where=where, dtype=dtype) # Maximum and minimum are ufunc's in NumPy, which means that our array's __array_ufunc__ @@ -188,20 +354,28 @@ def mean(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=True): def amax(x1, axis=None, out=None, keepdims=None, initial=None, where=True): - if hasattr(x1, "max"): - return x1.max( - axis=axis, out=out, keepdims=keepdims, initial=initial, where=where + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for amax, not {type(x1)}. Defaulting to NumPy." ) + return numpy.amax( + x1, axis=axis, out=out, keepdims=keepdims, initial=initial, where=where + ) + return x1.max(axis=axis, out=out, keepdims=keepdims, initial=initial, where=where) max = amax def amin(x1, axis=None, out=None, keepdims=None, initial=None, where=True): - if hasattr(x1, "min"): - return x1.min( - axis=axis, out=out, keepdims=keepdims, initial=initial, where=where + if not isinstance(x1, array): + ErrorMessage.single_warning( + f"Modin NumPy only supports objects of modin.numpy.array types for amin, not {type(x1)}. Defaulting to NumPy." + ) + return numpy.amin( + x1, axis=axis, out=out, keepdims=keepdims, initial=initial, where=where ) + return x1.min(axis=axis, out=out, keepdims=keepdims, initial=initial, where=where) min = amin diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 876d1715abd..6cd87eda8f3 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -277,7 +277,7 @@ def test_max(): numpy_result = numpy_arr.max(initial=0, where=False) assert modin_result == numpy_result with pytest.raises(ValueError): - modin_result = modin_arr.max(out=modin_arr, keepdims=True) + modin_arr.max(out=modin_arr, keepdims=True) modin_out = np.array([[1]]) numpy_out = modin_out._to_numpy() modin_result = modin_arr.max(out=modin_out, keepdims=True) @@ -364,7 +364,7 @@ def test_min(): numpy_result = numpy_arr.min(initial=0, where=False) assert modin_result == numpy_result with pytest.raises(ValueError): - modin_result = modin_arr.min(out=modin_arr, keepdims=True) + modin_arr.min(out=modin_arr, keepdims=True) modin_out = np.array([[1]]) numpy_out = modin_out._to_numpy() modin_result = modin_arr.min(out=modin_out, keepdims=True) @@ -451,7 +451,7 @@ def test_sum(): numpy_result = numpy_arr.sum(initial=0, where=False) assert modin_result == numpy_result with pytest.raises(ValueError): - modin_result = modin_arr.sum(out=modin_arr, keepdims=True) + modin_arr.sum(out=modin_arr, keepdims=True) modin_out = np.array([[1]]) numpy_out = modin_out._to_numpy() modin_result = modin_arr.sum(out=modin_out, keepdims=True) @@ -532,7 +532,7 @@ def test_mean(): numpy_result = numpy_arr.mean() assert modin_result == numpy_result with pytest.raises(ValueError): - modin_result = modin_arr.mean(out=modin_arr, keepdims=True) + modin_arr.mean(out=modin_arr, keepdims=True) modin_out = np.array([[1]]) numpy_out = modin_out._to_numpy() modin_result = modin_arr.mean(out=modin_out, keepdims=True) @@ -616,7 +616,7 @@ def test_prod(): numpy_result = numpy_arr.prod(initial=0, where=False) assert modin_result == numpy_result with pytest.raises(ValueError): - modin_result = modin_arr.prod(out=modin_arr, keepdims=True) + modin_arr.prod(out=modin_arr, keepdims=True) modin_out = np.array([[1]]) numpy_out = modin_out._to_numpy() modin_result = modin_arr.prod(out=modin_out, keepdims=True) From 5b1da61c48db4452788f9c1bda0a8e2ee2f48779 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 16:47:23 -0800 Subject: [PATCH 36/42] Fix type computation and add check for where Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 7 ++++++- modin/numpy/test/test_array.py | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index ae933dd41fe..ee2449da797 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -135,7 +135,12 @@ def fix_dtypes_and_determine_return( def find_common_dtype(dtypes): if len(dtypes) == 1: return dtypes[0] - return numpy.common_type(dtypes, []) + elif len(dtypes) == 2: + return numpy.promote_types(*dtypes) + midpoint = len(dtypes) // 2 + return numpy.promote_types( + find_common_dtype(dtypes[:midpoint]), find_common_dtype(dtypes[midpoint:]) + ) class array(object): diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index 6cd87eda8f3..c337c0d7e17 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -202,13 +202,13 @@ def test_array_where(): UserWarning, match="np.where method with only condition specified" ): warnings.filterwarnings("ignore", message="Distributing") - modin_flat_arr.where() + (modin_flat_arr <= 0).where() with pytest.raises(ValueError, match="np.where requires x and y"): - modin_flat_arr.where(x=["Should Fail."]) + (modin_flat_arr <= 0).where(x=["Should Fail."]) with pytest.warns(UserWarning, match="np.where not supported when both x and y"): warnings.filterwarnings("ignore", message="Distributing") - modin_result = modin_flat_arr.where(x=4, y=5) - numpy_result = numpy.where(numpy_flat_arr, 4, 5) + modin_result = (modin_flat_arr <= 0).where(x=4, y=5) + numpy_result = numpy.where(numpy_flat_arr <= 0, 4, 5) numpy.testing.assert_array_equal(numpy_result, modin_result._to_numpy()) modin_flat_bool_arr = modin_flat_arr <= 0 numpy_flat_bool_arr = numpy_flat_arr <= 0 From 74ed3a2623969461d3a5810ad8523a12f9144409 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 6 Feb 2023 16:57:47 -0800 Subject: [PATCH 37/42] Fix auto broadcast of out variable Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index ee2449da797..a666b82af97 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -70,29 +70,13 @@ def check_kwargs(order="C", subok=True, keepdims=None, casting="same_kind", wher ) -def check_how_broadcast_to_output(arr_in: "array", arr_out: "array"): +def check_can_broadcast_to_output(arr_in: "array", arr_out: "array"): if not isinstance(arr_out, array): raise TypeError("return arrays must be of modin.numpy.array type.") if arr_out._ndim == arr_in._ndim and arr_out.shape != arr_in.shape: raise ValueError( f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" ) - elif arr_out._ndim == arr_in._ndim: - return "broadcastable" - if arr_out._ndim == 1: - if prod(arr_in.shape) == arr_out.shape[0]: - return "flatten" - else: - raise ValueError( - f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" - ) - if arr_in._ndim == 1: - if prod(arr_out.shape) == arr_in.shape[0]: - return "reshape" - else: - raise ValueError( - f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}" - ) def fix_dtypes_and_determine_return( @@ -105,15 +89,10 @@ def fix_dtypes_and_determine_return( result = array(_query_compiler=query_compiler_in, _ndim=_ndim) if out is not None: out = try_convert_from_interoperable_type(out) - broadcast_method = check_how_broadcast_to_output(result, out) + check_can_broadcast_to_output(result, out) result._query_compiler = result._query_compiler.astype( {col_name: out.dtype for col_name in result._query_compiler.columns} ) - if broadcast_method == "flatten": - result = result.flatten() - elif broadcast_method != "broadcastable": - # TODO(RehanSD): Replace this when reshape is implemented. - raise NotImplementedError("Reshape is currently not supported in Modin.") if isinstance(where, array): out._query_compiler = where.where(result, out)._query_compiler elif where: @@ -590,21 +569,9 @@ def __abs__( result = result.astype({col_name: dtype for col_name in result.columns}) if out is not None: out = try_convert_from_interoperable_type(out) - broadcast_method = check_how_broadcast_to_output(self, out) - if broadcast_method == "broadcastable": - out._query_compiler = result - return out - elif broadcast_method == "flatten": - out._query_compiler = ( - array(_query_compiler=result, _ndim=self._ndim) - .flatten() - ._query_compiler - ) - else: - # TODO(RehanSD): Replace this when reshape is implemented. - raise NotImplementedError( - "Reshape is currently not supported in Modin." - ) + check_can_broadcast_to_output(self, out) + out._query_compiler = result + return out return array(_query_compiler=result, _ndim=self._ndim) absolute = __abs__ From 3244b79af4cd0fad3aba1a95648e43404a4701e0 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Feb 2023 15:11:33 -0800 Subject: [PATCH 38/42] Address review comments (break up testing into multiple files, and fix formatting issues) Signed-off-by: Rehan Durrani --- .github/workflows/ci.yml | 9 + .github/workflows/push-to-master.yml | 6 + .github/workflows/push.yml | 6 + modin/numpy/arr.py | 1 + modin/numpy/array_creation.py | 2 + modin/numpy/array_shaping.py | 1 + modin/numpy/test/test_array.py | 616 +----------------- modin/numpy/test/test_array_arithmetic.py | 164 +++++ modin/numpy/test/test_array_axis_functions.py | 445 +++++++++++++ modin/numpy/test/test_array_creation.py | 58 ++ modin/pandas/series.py | 4 + 11 files changed, 697 insertions(+), 615 deletions(-) create mode 100644 modin/numpy/test/test_array_arithmetic.py create mode 100644 modin/numpy/test/test_array_axis_functions.py create mode 100644 modin/numpy/test/test_array_creation.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8622778dcec..ed046a9f0e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -624,6 +624,9 @@ jobs: - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_reshape.py - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_general.py - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array.py + - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_creation.py + - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_arithmetic.py + - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_axis_functions.py - run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh - run: ./.github/workflows/sql_server/set_up_sql_server.sh - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_io.py --verbose @@ -715,6 +718,9 @@ jobs: - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 if: matrix.engine != 'python' - run: python -m pytest -n 2 modin/numpy/test/test_array.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py - run: python -m pytest -n 2 modin/pandas/test/test_general.py @@ -845,6 +851,9 @@ jobs: - modin/pandas/test/test_general.py - modin/pandas/test/test_io.py - modin/numpy/test/test_array.py + - modin/numpy/test/test_array_creation.py + - modin/numpy/test/test_array_axis_functions.py + - modin/numpy/test/test_array_arithmetic.py env: MODIN_ENGINE: ${{matrix.engine}} name: test-windows diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml index 3886886b596..dfa8dd03ee9 100644 --- a/.github/workflows/push-to-master.yml +++ b/.github/workflows/push-to-master.yml @@ -56,6 +56,9 @@ jobs: python -m pytest modin/pandas/test/dataframe/test_window.py python -m pytest modin/pandas/test/test_series.py python -m pytest modin/numpy/test/test_array.py + python -m pytest modin/numpy/test/test_array_creation.py + python -m pytest modin/numpy/test/test_array_arithmetic.py + python -m pytest modin/numpy/test/test_array_axis_functions.py python -m pytest modin/pandas/test/test_rolling.py python -m pytest modin/pandas/test/test_concat.py python -m pytest modin/pandas/test/test_groupby.py @@ -123,6 +126,9 @@ jobs: - modin/pandas/test/dataframe/test_pickle.py - modin/pandas/test/test_series.py - modin/numpy/test/test_array.py + - modin/numpy/test/test_array_creation.py + - modin/numpy/test/test_array_arithmetic.py + - modin/numpy/test/test_array_axis_functions.py - modin/pandas/test/test_rolling.py - modin/pandas/test/test_concat.py - modin/pandas/test/test_groupby.py diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index bc01666948d..b88dbcda923 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -298,6 +298,9 @@ jobs: - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py - run: python -m pytest -n 2 modin/pandas/test/test_series.py - run: python -m pytest -n 2 modin/numpy/test/test_array.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py + - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - run: python -m pytest -n 2 modin/pandas/test/test_concat.py if: matrix.engine == 'python' @@ -336,6 +339,9 @@ jobs: - modin/pandas/test/dataframe/test_pickle.py - modin/pandas/test/test_series.py - modin/numpy/test/test_array.py + - modin/numpy/test/test_array_creation.py + - modin/numpy/test/test_array_arithmetic.py + - modin/numpy/test/test_array_axis_functions.py - modin/pandas/test/test_rolling.py - modin/pandas/test/test_concat.py - modin/pandas/test/test_groupby.py diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index a666b82af97..e308f74814d 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -10,6 +10,7 @@ # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. + """Module houses ``array`` class, that is distributed version of ``numpy.array``.""" from math import prod diff --git a/modin/numpy/array_creation.py b/modin/numpy/array_creation.py index 295d4821793..fb55baa6593 100644 --- a/modin/numpy/array_creation.py +++ b/modin/numpy/array_creation.py @@ -12,7 +12,9 @@ # governing permissions and limitations under the License. """Module houses array creation methods for Modin's NumPy API.""" + import numpy + from modin.error_message import ErrorMessage from .arr import array diff --git a/modin/numpy/array_shaping.py b/modin/numpy/array_shaping.py index e9814e7a770..47d518a9673 100644 --- a/modin/numpy/array_shaping.py +++ b/modin/numpy/array_shaping.py @@ -12,6 +12,7 @@ # governing permissions and limitations under the License. """Module houses array shaping methods for Modin's NumPy API.""" + import numpy from modin.error_message import ErrorMessage diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index c337c0d7e17..ad688c7a884 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -14,6 +14,7 @@ import numpy import pytest import warnings + import modin.numpy as np @@ -40,123 +41,6 @@ def test_dtype(): assert modin_arr.dtype == numpy_arr.dtype -@pytest.mark.parametrize("operand1shape", [100, (3, 100)]) -@pytest.mark.parametrize("operand2shape", [100, (3, 100)]) -@pytest.mark.parametrize( - "operator", - [ - "__add__", - "__sub__", - "__truediv__", - "__mul__", - "__rtruediv__", - "__rmul__", - "__radd__", - "__rsub__", - "__ge__", - "__gt__", - "__lt__", - "__le__", - "__eq__", - "__ne__", - ], -) -def test_basic_arithmetic_with_broadcast(operand1shape, operand2shape, operator): - """Test of operators that support broadcasting.""" - operand1 = numpy.random.randint(-100, 100, size=operand1shape) - operand2 = numpy.random.randint(-100, 100, size=operand2shape) - modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) - numpy_result = getattr(operand1, operator)(operand2) - if operator not in ["__truediv__", "__rtruediv__"]: - numpy.testing.assert_array_equal( - modin_result._to_numpy(), - numpy_result, - err_msg=f"Binary Op {operator} failed.", - ) - else: - # Truediv can have precision issues. - numpy.testing.assert_array_almost_equal( - modin_result._to_numpy(), - numpy_result, - decimal=12, - err_msg="Binary Op __truediv__ failed.", - ) - - -@pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) -def test_arithmetic(operator): - """Test of operators that do not yet support broadcasting""" - for size, textdim in ((100, "1D"), ((10, 10), "2D")): - operand1 = numpy.random.randint(-100, 100, size=size) - lower_bound = -100 if operator != "__pow__" else 0 - operand2 = numpy.random.randint(lower_bound, 100, size=size) - modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) - numpy_result = getattr(operand1, operator)(operand2) - numpy.testing.assert_array_almost_equal( - modin_result._to_numpy(), - numpy_result, - decimal=12, - err_msg=f"Binary Op {operator} failed on {textdim} arrays.", - ) - - -def test_arithmetic_nans_and_zeros(): - numpy_arr1 = numpy.array([[1, 0, 3], [numpy.nan, 0, numpy.nan]]) - numpy_arr2 = numpy.array([1, 0, 0]) - numpy.testing.assert_array_equal( - numpy_arr1 // numpy_arr2, - (np.array(numpy_arr1) // np.array(numpy_arr2))._to_numpy(), - ) - numpy.testing.assert_array_equal( - numpy.array([0]) // 0, (np.array([0]) // 0)._to_numpy() - ) - numpy.testing.assert_array_equal( - numpy.array([0], dtype=numpy.float64) // 0, - (np.array([0], dtype=numpy.float64) // 0)._to_numpy(), - ) - - -@pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) -def test_scalar_arithmetic(size): - numpy_arr = numpy.random.randint(-100, 100, size=size) - modin_arr = np.array(numpy_arr) - scalar = numpy.random.randint(1, 100) - numpy.testing.assert_array_equal( - (scalar * modin_arr)._to_numpy(), scalar * numpy_arr, err_msg="__mul__ failed." - ) - numpy.testing.assert_array_equal( - (modin_arr * scalar)._to_numpy(), - scalar * numpy_arr, - err_msg="__rmul__ failed.", - ) - numpy.testing.assert_array_equal( - (scalar / modin_arr)._to_numpy(), - scalar / numpy_arr, - err_msg="__rtruediv__ failed.", - ) - numpy.testing.assert_array_equal( - (modin_arr / scalar)._to_numpy(), - numpy_arr / scalar, - err_msg="__truediv__ failed.", - ) - numpy.testing.assert_array_equal( - (scalar + modin_arr)._to_numpy(), - scalar + numpy_arr, - err_msg="__radd__ failed.", - ) - numpy.testing.assert_array_equal( - (modin_arr + scalar)._to_numpy(), scalar + numpy_arr, err_msg="__add__ failed." - ) - numpy.testing.assert_array_equal( - (scalar - modin_arr)._to_numpy(), - scalar - numpy_arr, - err_msg="__rsub__ failed.", - ) - numpy.testing.assert_array_equal( - (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." - ) - - @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_array_ufunc(size): # Test ufunc.__call__ @@ -232,462 +116,6 @@ def test_array_where(): numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) -def test_max(): - # Test 1D - numpy_arr = numpy.random.randint(-100, 100, size=100) - modin_arr = np.array(numpy_arr) - assert modin_arr.max() == numpy_arr.max() - modin_result = modin_arr.max(axis=0) - numpy_result = modin_arr.max(axis=0) - assert modin_result == numpy_result - modin_result = modin_arr.max(initial=200) - numpy_result = numpy_arr.max(initial=200) - assert modin_result == numpy_result - modin_result = modin_arr.max(initial=0, where=False) - numpy_result = numpy_arr.max(initial=0, where=False) - assert modin_result == numpy_result - modin_result = modin_arr.max(keepdims=True) - numpy_result = numpy_arr.max(keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - # Test 2D - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - assert modin_arr.max() == numpy_arr.max() - modin_result = modin_arr.max(axis=0) - numpy_result = numpy_arr.max(axis=0) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.max(axis=0, keepdims=True) - numpy_result = numpy_arr.max(axis=0, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.max(axis=1) - numpy_result = numpy_arr.max(axis=1) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.max(axis=1, keepdims=True) - numpy_result = numpy_arr.max(axis=1, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.max(initial=200) - numpy_result = numpy_arr.max(initial=200) - assert modin_result == numpy_result - modin_result = modin_arr.max(initial=0, where=False) - numpy_result = numpy_arr.max(initial=0, where=False) - assert modin_result == numpy_result - with pytest.raises(ValueError): - modin_arr.max(out=modin_arr, keepdims=True) - modin_out = np.array([[1]]) - numpy_out = modin_out._to_numpy() - modin_result = modin_arr.max(out=modin_out, keepdims=True) - numpy_result = numpy_arr.max(out=numpy_out, keepdims=True) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - modin_result = modin_arr.max(axis=0, where=False, initial=4) - numpy_result = numpy_arr.max(axis=0, where=False, initial=4) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.max(axis=0, where=False, initial=4, out=modin_out) - numpy_result = numpy_arr.max(axis=0, where=False, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.max(axis=0, initial=4, out=modin_out) - numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.max(axis=1, initial=4, out=modin_out) - numpy_result = numpy_arr.max(axis=1, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - numpy_where = numpy.full(20, False) - numpy_where[:10] = True - numpy.random.shuffle(numpy_where) - modin_where = np.array(numpy_where) - modin_result = modin_arr.max(axis=0, initial=4, out=modin_out, where=modin_where) - numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out, where=numpy_where) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - - -def test_min(): - # Test 1D - numpy_arr = numpy.random.randint(-100, 100, size=100) - modin_arr = np.array(numpy_arr) - assert modin_arr.min() == numpy_arr.min() - modin_result = modin_arr.min(axis=0) - numpy_result = modin_arr.min(axis=0) - assert modin_result == numpy_result - modin_result = modin_arr.min(initial=-200) - numpy_result = numpy_arr.min(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.min(initial=0, where=False) - numpy_result = numpy_arr.min(initial=0, where=False) - assert modin_result == numpy_result - modin_result = modin_arr.min(keepdims=True) - numpy_result = numpy_arr.min(keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - # Test 2D - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - assert modin_arr.min() == numpy_arr.min() - modin_result = modin_arr.min(axis=0) - numpy_result = numpy_arr.min(axis=0) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.min(axis=0, keepdims=True) - numpy_result = numpy_arr.min(axis=0, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.min(axis=1) - numpy_result = numpy_arr.min(axis=1) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.min(axis=1, keepdims=True) - numpy_result = numpy_arr.min(axis=1, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.min(initial=-200) - numpy_result = numpy_arr.min(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.min(initial=0, where=False) - numpy_result = numpy_arr.min(initial=0, where=False) - assert modin_result == numpy_result - with pytest.raises(ValueError): - modin_arr.min(out=modin_arr, keepdims=True) - modin_out = np.array([[1]]) - numpy_out = modin_out._to_numpy() - modin_result = modin_arr.min(out=modin_out, keepdims=True) - numpy_result = numpy_arr.min(out=numpy_out, keepdims=True) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - modin_result = modin_arr.min(axis=0, where=False, initial=4) - numpy_result = numpy_arr.min(axis=0, where=False, initial=4) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.min(axis=0, where=False, initial=4, out=modin_out) - numpy_result = numpy_arr.min(axis=0, where=False, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.min(axis=0, initial=4, out=modin_out) - numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.min(axis=1, initial=4, out=modin_out) - numpy_result = numpy_arr.min(axis=1, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - numpy_where = numpy.full(20, False) - numpy_where[:10] = True - numpy.random.shuffle(numpy_where) - modin_where = np.array(numpy_where) - modin_result = modin_arr.min(axis=0, initial=4, out=modin_out, where=modin_where) - numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out, where=numpy_where) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - - -def test_sum(): - # Test 1D - numpy_arr = numpy.random.randint(-100, 100, size=100) - modin_arr = np.array(numpy_arr) - assert modin_arr.sum() == numpy_arr.sum() - modin_result = modin_arr.sum(axis=0) - numpy_result = modin_arr.sum(axis=0) - assert modin_result == numpy_result - modin_result = modin_arr.sum(initial=-200) - numpy_result = numpy_arr.sum(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.sum(initial=0, where=False) - numpy_result = numpy_arr.sum(initial=0, where=False) - assert modin_result == numpy_result - modin_result = modin_arr.sum(keepdims=True) - numpy_result = numpy_arr.sum(keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - # Test 2D - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - assert modin_arr.sum() == numpy_arr.sum() - modin_result = modin_arr.sum(axis=0) - numpy_result = numpy_arr.sum(axis=0) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.sum(axis=0, keepdims=True) - numpy_result = numpy_arr.sum(axis=0, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.sum(axis=1) - numpy_result = numpy_arr.sum(axis=1) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.sum(axis=1, keepdims=True) - numpy_result = numpy_arr.sum(axis=1, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.sum(initial=-200) - numpy_result = numpy_arr.sum(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.sum(initial=0, where=False) - numpy_result = numpy_arr.sum(initial=0, where=False) - assert modin_result == numpy_result - with pytest.raises(ValueError): - modin_arr.sum(out=modin_arr, keepdims=True) - modin_out = np.array([[1]]) - numpy_out = modin_out._to_numpy() - modin_result = modin_arr.sum(out=modin_out, keepdims=True) - numpy_result = numpy_arr.sum(out=numpy_out, keepdims=True) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - modin_result = modin_arr.sum(axis=0, where=False, initial=4) - numpy_result = numpy_arr.sum(axis=0, where=False, initial=4) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.sum(axis=0, where=False, initial=4, out=modin_out) - numpy_result = numpy_arr.sum(axis=0, where=False, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out) - numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.sum(axis=1, initial=4, out=modin_out) - numpy_result = numpy_arr.sum(axis=1, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - numpy_where = numpy.full(20, False) - numpy_where[:10] = True - numpy.random.shuffle(numpy_where) - modin_where = np.array(numpy_where) - modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out, where=modin_where) - numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out, where=numpy_where) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - - -def test_mean(): - # Test 1D - numpy_arr = numpy.random.randint(-100, 100, size=100) - modin_arr = np.array(numpy_arr) - assert modin_arr.mean() == numpy_arr.mean() - modin_result = modin_arr.mean(axis=0) - numpy_result = modin_arr.mean(axis=0) - assert modin_result == numpy_result - modin_result = modin_arr.mean() - numpy_result = numpy_arr.mean() - assert modin_result == numpy_result - modin_result = modin_arr.mean(keepdims=True) - numpy_result = numpy_arr.mean(keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - # Test 2D - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - assert modin_arr.mean() == numpy_arr.mean() - modin_result = modin_arr.mean(axis=0) - numpy_result = numpy_arr.mean(axis=0) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.mean(axis=0, keepdims=True) - numpy_result = numpy_arr.mean(axis=0, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.mean(axis=1) - numpy_result = numpy_arr.mean(axis=1) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.mean(axis=1, keepdims=True) - numpy_result = numpy_arr.mean(axis=1, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.mean() - numpy_result = numpy_arr.mean() - assert modin_result == numpy_result - with pytest.raises(ValueError): - modin_arr.mean(out=modin_arr, keepdims=True) - modin_out = np.array([[1]]) - numpy_out = modin_out._to_numpy() - modin_result = modin_arr.mean(out=modin_out, keepdims=True) - numpy_result = numpy_arr.mean(out=numpy_out, keepdims=True) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.mean(axis=0, where=False, out=modin_out) - numpy_result = numpy_arr.mean(axis=0, where=False, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.mean(axis=0, out=modin_out) - numpy_result = numpy_arr.mean(axis=0, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.mean(axis=1, out=modin_out) - numpy_result = numpy_arr.mean(axis=1, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - numpy_where = numpy.full(20, False) - numpy_where[:10] = True - numpy.random.shuffle(numpy_where) - modin_where = np.array(numpy_where) - modin_result = modin_arr.mean(axis=0, out=modin_out, where=modin_where) - numpy_result = numpy_arr.mean(axis=0, out=numpy_out, where=numpy_where) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - - -def test_prod(): - # Test 1D - numpy_arr = numpy.random.randint(-100, 100, size=100) - modin_arr = np.array(numpy_arr) - assert modin_arr.prod() == numpy_arr.prod() - modin_result = modin_arr.prod(axis=0) - numpy_result = modin_arr.prod(axis=0) - assert modin_result == numpy_result - modin_result = modin_arr.prod(initial=-200) - numpy_result = numpy_arr.prod(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.prod(initial=0, where=False) - numpy_result = numpy_arr.prod(initial=0, where=False) - assert modin_result == numpy_result - modin_result = modin_arr.prod(keepdims=True) - numpy_result = numpy_arr.prod(keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - # Test 2D - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - assert modin_arr.prod() == numpy_arr.prod() - modin_result = modin_arr.prod(axis=0) - numpy_result = numpy_arr.prod(axis=0) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.prod(axis=0, keepdims=True) - numpy_result = numpy_arr.prod(axis=0, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.prod(axis=1) - numpy_result = numpy_arr.prod(axis=1) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.prod(axis=1, keepdims=True) - numpy_result = numpy_arr.prod(axis=1, keepdims=True) - assert modin_result.shape == numpy_result.shape - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - modin_result = modin_arr.prod(initial=-200) - numpy_result = numpy_arr.prod(initial=-200) - assert modin_result == numpy_result - modin_result = modin_arr.prod(initial=0, where=False) - numpy_result = numpy_arr.prod(initial=0, where=False) - assert modin_result == numpy_result - with pytest.raises(ValueError): - modin_arr.prod(out=modin_arr, keepdims=True) - modin_out = np.array([[1]]) - numpy_out = modin_out._to_numpy() - modin_result = modin_arr.prod(out=modin_out, keepdims=True) - numpy_result = numpy_arr.prod(out=numpy_out, keepdims=True) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - modin_result = modin_arr.prod(axis=0, where=False, initial=4) - numpy_result = numpy_arr.prod(axis=0, where=False, initial=4) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.prod(axis=0, where=False, initial=4, out=modin_out) - numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) - modin_arr = np.array(numpy_arr) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out) - numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - modin_result = modin_arr.prod(axis=1, initial=4, out=modin_out) - numpy_result = numpy_arr.prod(axis=1, initial=4, out=numpy_out) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - numpy_out = numpy.ones(20) - modin_out = np.array(numpy_out) - numpy_where = numpy.full(20, False) - numpy_where[:10] = True - numpy.random.shuffle(numpy_where) - modin_where = np.array(numpy_where) - modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out, where=modin_where) - numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out, where=numpy_where) - numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) - numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) - - -def test_abs(): - numpy_flat_arr = numpy.random.randint(-100, 100, size=100) - modin_flat_arr = np.array(numpy_flat_arr) - numpy.testing.assert_array_equal( - numpy.abs(numpy_flat_arr), np.abs(modin_flat_arr)._to_numpy() - ) - numpy_arr = numpy_flat_arr.reshape((10, 10)) - modin_arr = np.array(numpy_arr) - numpy.testing.assert_array_equal( - numpy.abs(numpy_arr), np.abs(modin_arr)._to_numpy() - ) - - -def test_invert(): - numpy_flat_arr = numpy.random.randint(-100, 100, size=100) - modin_flat_arr = np.array(numpy_flat_arr) - numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) - numpy_arr = numpy_flat_arr.reshape((10, 10)) - modin_arr = np.array(numpy_arr) - numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) - numpy_flat_arr = numpy.random.randint(-100, 100, size=100) < 0 - modin_flat_arr = np.array(numpy_flat_arr) - numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) - numpy_arr = numpy_flat_arr.reshape((10, 10)) - modin_arr = np.array(numpy_arr) - numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) - - def test_flatten(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) @@ -734,45 +162,3 @@ def test_astype(): numpy_result = numpy_arr.astype(numpy.float64, copy=False) numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) numpy.testing.assert_array_equal(modin_arr._to_numpy(), numpy_arr) - - -def test_zeros_like(): - modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) - numpy_arr = modin_arr._to_numpy() - numpy.testing.assert_array_equal( - numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() - ) - numpy.testing.assert_array_equal( - numpy.zeros_like(numpy_arr, dtype=numpy.int8), - np.zeros_like(modin_arr, dtype=numpy.int8)._to_numpy(), - ) - numpy.testing.assert_array_equal( - numpy.zeros_like(numpy_arr, shape=(10, 10)), - np.zeros_like(modin_arr, shape=(10, 10))._to_numpy(), - ) - modin_arr = np.array([[1, 2], [3, 4]]) - numpy_arr = modin_arr._to_numpy() - numpy.testing.assert_array_equal( - numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() - ) - - -def test_ones_like(): - modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) - numpy_arr = modin_arr._to_numpy() - numpy.testing.assert_array_equal( - numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() - ) - numpy.testing.assert_array_equal( - numpy.ones_like(numpy_arr, dtype=numpy.int8), - np.ones_like(modin_arr, dtype=numpy.int8)._to_numpy(), - ) - numpy.testing.assert_array_equal( - numpy.ones_like(numpy_arr, shape=(10, 10)), - np.ones_like(modin_arr, shape=(10, 10))._to_numpy(), - ) - modin_arr = np.array([[1, 2], [3, 4]]) - numpy_arr = modin_arr._to_numpy() - numpy.testing.assert_array_equal( - numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() - ) diff --git a/modin/numpy/test/test_array_arithmetic.py b/modin/numpy/test/test_array_arithmetic.py new file mode 100644 index 00000000000..e4ddd0381ac --- /dev/null +++ b/modin/numpy/test/test_array_arithmetic.py @@ -0,0 +1,164 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import numpy +import pytest + +import modin.numpy as np + + +@pytest.mark.parametrize("operand1_shape", [100, (3, 100)]) +@pytest.mark.parametrize("operand2_shape", [100, (3, 100)]) +@pytest.mark.parametrize( + "operator", + [ + "__add__", + "__sub__", + "__truediv__", + "__mul__", + "__rtruediv__", + "__rmul__", + "__radd__", + "__rsub__", + "__ge__", + "__gt__", + "__lt__", + "__le__", + "__eq__", + "__ne__", + ], +) +def test_basic_arithmetic_with_broadcast(operand1_shape, operand2_shape, operator): + """Test of operators that support broadcasting.""" + operand1 = numpy.random.randint(-100, 100, size=operand1_shape) + operand2 = numpy.random.randint(-100, 100, size=operand2_shape) + modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) + numpy_result = getattr(operand1, operator)(operand2) + if operator not in ["__truediv__", "__rtruediv__"]: + numpy.testing.assert_array_equal( + modin_result._to_numpy(), + numpy_result, + err_msg=f"Binary Op {operator} failed.", + ) + else: + # Truediv can have precision issues, where thanks to floating point error, the numbers + # aren't exactly the same across both, but are functionally equivalent, since the difference + # is less than 1e-12. + numpy.testing.assert_array_almost_equal( + modin_result._to_numpy(), + numpy_result, + decimal=12, + err_msg="Binary Op __truediv__ failed.", + ) + + +@pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) +def test_arithmetic(operator): + """Test of operators that do not yet support broadcasting""" + for size, textdim in ((100, "1D"), ((10, 10), "2D")): + operand1 = numpy.random.randint(-100, 100, size=size) + lower_bound = -100 if operator != "__pow__" else 0 + operand2 = numpy.random.randint(lower_bound, 100, size=size) + modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) + numpy_result = getattr(operand1, operator)(operand2) + numpy.testing.assert_array_almost_equal( + modin_result._to_numpy(), + numpy_result, + decimal=12, + err_msg=f"Binary Op {operator} failed on {textdim} arrays.", + ) + + +def test_arithmetic_nans_and_zeros(): + numpy_arr1 = numpy.array([[1, 0, 3], [numpy.nan, 0, numpy.nan]]) + numpy_arr2 = numpy.array([1, 0, 0]) + numpy.testing.assert_array_equal( + numpy_arr1 // numpy_arr2, + (np.array(numpy_arr1) // np.array(numpy_arr2))._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.array([0]) // 0, (np.array([0]) // 0)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.array([0], dtype=numpy.float64) // 0, + (np.array([0], dtype=numpy.float64) // 0)._to_numpy(), + ) + + +@pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) +def test_scalar_arithmetic(size): + numpy_arr = numpy.random.randint(-100, 100, size=size) + modin_arr = np.array(numpy_arr) + scalar = numpy.random.randint(1, 100) + numpy.testing.assert_array_equal( + (scalar * modin_arr)._to_numpy(), scalar * numpy_arr, err_msg="__mul__ failed." + ) + numpy.testing.assert_array_equal( + (modin_arr * scalar)._to_numpy(), + scalar * numpy_arr, + err_msg="__rmul__ failed.", + ) + numpy.testing.assert_array_equal( + (scalar / modin_arr)._to_numpy(), + scalar / numpy_arr, + err_msg="__rtruediv__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr / scalar)._to_numpy(), + numpy_arr / scalar, + err_msg="__truediv__ failed.", + ) + numpy.testing.assert_array_equal( + (scalar + modin_arr)._to_numpy(), + scalar + numpy_arr, + err_msg="__radd__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr + scalar)._to_numpy(), scalar + numpy_arr, err_msg="__add__ failed." + ) + numpy.testing.assert_array_equal( + (scalar - modin_arr)._to_numpy(), + scalar - numpy_arr, + err_msg="__rsub__ failed.", + ) + numpy.testing.assert_array_equal( + (modin_arr - scalar)._to_numpy(), numpy_arr - scalar, err_msg="__sub__ failed." + ) + + +def test_abs(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal( + numpy.abs(numpy_flat_arr), np.abs(modin_flat_arr)._to_numpy() + ) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal( + numpy.abs(numpy_arr), np.abs(modin_arr)._to_numpy() + ) + + +def test_invert(): + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) + numpy_flat_arr = numpy.random.randint(-100, 100, size=100) < 0 + modin_flat_arr = np.array(numpy_flat_arr) + numpy.testing.assert_array_equal(~numpy_flat_arr, (~modin_flat_arr)._to_numpy()) + numpy_arr = numpy_flat_arr.reshape((10, 10)) + modin_arr = np.array(numpy_arr) + numpy.testing.assert_array_equal(~numpy_arr, (~modin_arr)._to_numpy()) diff --git a/modin/numpy/test/test_array_axis_functions.py b/modin/numpy/test/test_array_axis_functions.py new file mode 100644 index 00000000000..925adb9b089 --- /dev/null +++ b/modin/numpy/test/test_array_axis_functions.py @@ -0,0 +1,445 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import numpy +import pytest + +import modin.numpy as np + + +def test_max(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.max() == numpy_arr.max() + modin_result = modin_arr.max(axis=0) + numpy_result = modin_arr.max(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=200) + numpy_result = numpy_arr.max(initial=200) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=0, where=False) + numpy_result = numpy_arr.max(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.max(keepdims=True) + numpy_result = numpy_arr.max(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.max() == numpy_arr.max() + modin_result = modin_arr.max(axis=0) + numpy_result = numpy_arr.max(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=0, keepdims=True) + numpy_result = numpy_arr.max(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=1) + numpy_result = numpy_arr.max(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(axis=1, keepdims=True) + numpy_result = numpy_arr.max(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.max(initial=200) + numpy_result = numpy_arr.max(initial=200) + assert modin_result == numpy_result + modin_result = modin_arr.max(initial=0, where=False) + numpy_result = numpy_arr.max(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_arr.max(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.max(out=modin_out, keepdims=True) + numpy_result = numpy_arr.max(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.max(axis=0, where=False, initial=4) + numpy_result = numpy_arr.max(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.max(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.max(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.max(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + + +def test_min(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.min() == numpy_arr.min() + modin_result = modin_arr.min(axis=0) + numpy_result = modin_arr.min(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=-200) + numpy_result = numpy_arr.min(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=0, where=False) + numpy_result = numpy_arr.min(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.min(keepdims=True) + numpy_result = numpy_arr.min(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.min() == numpy_arr.min() + modin_result = modin_arr.min(axis=0) + numpy_result = numpy_arr.min(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=0, keepdims=True) + numpy_result = numpy_arr.min(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=1) + numpy_result = numpy_arr.min(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(axis=1, keepdims=True) + numpy_result = numpy_arr.min(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.min(initial=-200) + numpy_result = numpy_arr.min(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.min(initial=0, where=False) + numpy_result = numpy_arr.min(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_arr.min(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.min(out=modin_out, keepdims=True) + numpy_result = numpy_arr.min(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.min(axis=0, where=False, initial=4) + numpy_result = numpy_arr.min(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.min(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.min(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.min(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + + +def test_sum(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.sum() == numpy_arr.sum() + modin_result = modin_arr.sum(axis=0) + numpy_result = modin_arr.sum(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=-200) + numpy_result = numpy_arr.sum(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=0, where=False) + numpy_result = numpy_arr.sum(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.sum(keepdims=True) + numpy_result = numpy_arr.sum(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.sum() == numpy_arr.sum() + modin_result = modin_arr.sum(axis=0) + numpy_result = numpy_arr.sum(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=0, keepdims=True) + numpy_result = numpy_arr.sum(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=1) + numpy_result = numpy_arr.sum(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(axis=1, keepdims=True) + numpy_result = numpy_arr.sum(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.sum(initial=-200) + numpy_result = numpy_arr.sum(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.sum(initial=0, where=False) + numpy_result = numpy_arr.sum(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_arr.sum(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.sum(out=modin_out, keepdims=True) + numpy_result = numpy_arr.sum(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.sum(axis=0, where=False, initial=4) + numpy_result = numpy_arr.sum(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.sum(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.sum(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + + +def test_mean(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.mean() == numpy_arr.mean() + modin_result = modin_arr.mean(axis=0) + numpy_result = modin_arr.mean(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.mean() + numpy_result = numpy_arr.mean() + assert modin_result == numpy_result + modin_result = modin_arr.mean(keepdims=True) + numpy_result = numpy_arr.mean(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.mean() == numpy_arr.mean() + modin_result = modin_arr.mean(axis=0) + numpy_result = numpy_arr.mean(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=0, keepdims=True) + numpy_result = numpy_arr.mean(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=1) + numpy_result = numpy_arr.mean(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean(axis=1, keepdims=True) + numpy_result = numpy_arr.mean(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.mean() + numpy_result = numpy_arr.mean() + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_arr.mean(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.mean(out=modin_out, keepdims=True) + numpy_result = numpy_arr.mean(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=0, where=False, out=modin_out) + numpy_result = numpy_arr.mean(axis=0, where=False, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=0, out=modin_out) + numpy_result = numpy_arr.mean(axis=0, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.mean(axis=1, out=modin_out) + numpy_result = numpy_arr.mean(axis=1, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.mean(axis=0, out=modin_out, where=modin_where) + numpy_result = numpy_arr.mean(axis=0, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + + +def test_prod(): + # Test 1D + numpy_arr = numpy.random.randint(-100, 100, size=100) + modin_arr = np.array(numpy_arr) + assert modin_arr.prod() == numpy_arr.prod() + modin_result = modin_arr.prod(axis=0) + numpy_result = modin_arr.prod(axis=0) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=-200) + numpy_result = numpy_arr.prod(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=0, where=False) + numpy_result = numpy_arr.prod(initial=0, where=False) + assert modin_result == numpy_result + modin_result = modin_arr.prod(keepdims=True) + numpy_result = numpy_arr.prod(keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + # Test 2D + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + assert modin_arr.prod() == numpy_arr.prod() + modin_result = modin_arr.prod(axis=0) + numpy_result = numpy_arr.prod(axis=0) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=0, keepdims=True) + numpy_result = numpy_arr.prod(axis=0, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=1) + numpy_result = numpy_arr.prod(axis=1) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(axis=1, keepdims=True) + numpy_result = numpy_arr.prod(axis=1, keepdims=True) + assert modin_result.shape == numpy_result.shape + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + modin_result = modin_arr.prod(initial=-200) + numpy_result = numpy_arr.prod(initial=-200) + assert modin_result == numpy_result + modin_result = modin_arr.prod(initial=0, where=False) + numpy_result = numpy_arr.prod(initial=0, where=False) + assert modin_result == numpy_result + with pytest.raises(ValueError): + modin_arr.prod(out=modin_arr, keepdims=True) + modin_out = np.array([[1]]) + numpy_out = modin_out._to_numpy() + modin_result = modin_arr.prod(out=modin_out, keepdims=True) + numpy_result = numpy_arr.prod(out=numpy_out, keepdims=True) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + modin_result = modin_arr.prod(axis=0, where=False, initial=4) + numpy_result = numpy_arr.prod(axis=0, where=False, initial=4) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=0, where=False, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_arr = numpy.random.randint(-100, 100, size=(20, 20)) + modin_arr = np.array(numpy_arr) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + modin_result = modin_arr.prod(axis=1, initial=4, out=modin_out) + numpy_result = numpy_arr.prod(axis=1, initial=4, out=numpy_out) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) + numpy_out = numpy.ones(20) + modin_out = np.array(numpy_out) + numpy_where = numpy.full(20, False) + numpy_where[:10] = True + numpy.random.shuffle(numpy_where) + modin_where = np.array(numpy_where) + modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out, where=modin_where) + numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out, where=numpy_where) + numpy.testing.assert_array_equal(modin_result._to_numpy(), numpy_result) + numpy.testing.assert_array_equal(modin_out._to_numpy(), numpy_out) diff --git a/modin/numpy/test/test_array_creation.py b/modin/numpy/test/test_array_creation.py new file mode 100644 index 00000000000..02606db2b9f --- /dev/null +++ b/modin/numpy/test/test_array_creation.py @@ -0,0 +1,58 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import numpy + +import modin.numpy as np + + +def test_zeros_like(): + modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr, dtype=numpy.int8), + np.zeros_like(modin_arr, dtype=numpy.int8)._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr, shape=(10, 10)), + np.zeros_like(modin_arr, shape=(10, 10))._to_numpy(), + ) + modin_arr = np.array([[1, 2], [3, 4]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.zeros_like(numpy_arr), np.zeros_like(modin_arr)._to_numpy() + ) + + +def test_ones_like(): + modin_arr = np.array([[1.0, 2.0], [3.0, 4.0]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() + ) + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr, dtype=numpy.int8), + np.ones_like(modin_arr, dtype=numpy.int8)._to_numpy(), + ) + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr, shape=(10, 10)), + np.ones_like(modin_arr, shape=(10, 10))._to_numpy(), + ) + modin_arr = np.array([[1, 2], [3, 4]]) + numpy_arr = modin_arr._to_numpy() + numpy.testing.assert_array_equal( + numpy.ones_like(numpy_arr), np.ones_like(modin_arr)._to_numpy() + ) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 49aae1d1232..15d6678c96c 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -500,6 +500,10 @@ def values(self): # noqa: RT01, D200 data = self.to_numpy() if isinstance(self.dtype, pd.CategoricalDtype): + from modin.config import ExperimentalNumPyAPI + + if ExperimentalNumPyAPI.get(): + data = data._to_numpy() data = pd.Categorical(data, dtype=self.dtype) return data From a3d57fee9e15be3d066292d69e560bd9c8e17bfd Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Feb 2023 15:23:58 -0800 Subject: [PATCH 39/42] Fix naming Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index e308f74814d..3e038fa4fb0 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -178,7 +178,7 @@ def __init__( assert arr.ndim in ( 1, 2, - ), "Modin.NumPy currently only supports 1D and 2D objects." + ), "modin.numpy currently only supports 1D and 2D objects." self._ndim = len(arr.shape) if self._ndim > 2: ErrorMessage.not_implemented( From 18588b812713af1adfa1f877c5724cb0d3b233be Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 8 Feb 2023 11:02:47 -0800 Subject: [PATCH 40/42] Add to_numpy Signed-off-by: Rehan Durrani --- modin/utils.py | 86 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/modin/utils.py b/modin/utils.py index 89d9a3162e0..11c37610aef 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -15,15 +15,7 @@ import importlib import types -from typing import ( - Any, - Callable, - List, - Mapping, - Optional, - Union, - TypeVar, -) +from typing import Any, Callable, List, Mapping, Optional, Union, TypeVar, overload import re import sys import json @@ -44,7 +36,7 @@ from pandas.util._print_versions import _get_sys_info, _get_dependency_info # type: ignore[attr-defined] from pandas._typing import JSONSerializable -from modin.config import Engine, StorageFormat, IsExperimental +from modin.config import Engine, StorageFormat, IsExperimental, ExperimentalNumPyAPI from modin._version import get_versions T = TypeVar("T") @@ -71,6 +63,22 @@ def to_pandas(self) -> Any: # noqa: GL08 pass +@runtime_checkable +class SupportsPublicToNumPy(Protocol): # noqa: PR01 + """Structural type for objects with a ``to_numpy`` method (without a leading underscore).""" + + def to_numpy(self) -> Any: # noqa: GL08 + pass + + +@runtime_checkable +class SupportsPrivateToNumPy(Protocol): # noqa: PR01 + """Structural type for objects with a ``_to_numpy`` method (note the leading underscore).""" + + def _to_numpy(self) -> Any: # noqa: GL08 + pass + + MIN_RAY_VERSION = version.parse("1.4.0") MIN_DASK_VERSION = version.parse("2.22.0") MIN_UNIDIST_VERSION = version.parse("0.2.1") @@ -452,6 +460,64 @@ def to_pandas(modin_obj: SupportsPrivateToPandas) -> Any: return modin_obj._to_pandas() +@overload +def to_numpy(modin_obj: SupportsPrivateToNumPy) -> Any: + """ + Convert a Modin array to a NumPy array. + + Parameters + ---------- + modin_obj : modin.numpy.array + The Modin array to convert. + + Returns + ------- + numpy.array + Converted object with type depending on input. + """ + ... + + +@overload +def to_numpy(modin_obj: SupportsPublicToNumPy) -> Any: + """ + Convert a Modin DataFrame/Series to a NumPy array. + + Parameters + ---------- + modin_obj : modin.DataFrame, modin.Series + The Modin DataFrame/Series to convert. + + Returns + ------- + numpy.array + Converted object with type depending on input. + """ + ... + + +def to_numpy(modin_obj: Union[SupportsPrivateToNumPy, SupportsPublicToNumPy]) -> Any: + """ + Convert a Modin object to a NumPy array. + + Parameters + ---------- + modin_obj : modin.DataFrame, modin.Series, modin.numpy.array + The modin distributed object to convert. + + Returns + ------- + numpy.array + Converted object with type depending on input. + """ + if isinstance(modin_obj, SupportsPrivateToNumPy): + return modin_obj._to_numpy() + array = modin_obj.to_numpy() + if ExperimentalNumPyAPI.get(): + array = array._to_numpy() + return array + + def hashable(obj: bool) -> bool: """ Return whether the `obj` is hashable. From 19acf64c3d78fda3f1e1fe52f98b1ca15d14d48b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 8 Feb 2023 12:28:42 -0800 Subject: [PATCH 41/42] Add warning about numpy api and fix lint Signed-off-by: Rehan Durrani --- modin/numpy/arr.py | 3 +++ modin/utils.py | 40 +++------------------------------------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 3e038fa4fb0..06e624deb82 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -149,6 +149,9 @@ def __init__( _query_compiler=None, _ndim=None, ): + ErrorMessage.single_warning( + "Using Modin's new NumPy API. To convert from a Modin object to a NumPy array, either turn off the ExperimentalNumPyAPI flag, or use `modin.utils.to_numpy`." + ) if _query_compiler is not None: self._query_compiler = _query_compiler self._ndim = _ndim diff --git a/modin/utils.py b/modin/utils.py index 11c37610aef..b8e0556131b 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -460,43 +460,9 @@ def to_pandas(modin_obj: SupportsPrivateToPandas) -> Any: return modin_obj._to_pandas() -@overload -def to_numpy(modin_obj: SupportsPrivateToNumPy) -> Any: - """ - Convert a Modin array to a NumPy array. - - Parameters - ---------- - modin_obj : modin.numpy.array - The Modin array to convert. - - Returns - ------- - numpy.array - Converted object with type depending on input. - """ - ... - - -@overload -def to_numpy(modin_obj: SupportsPublicToNumPy) -> Any: - """ - Convert a Modin DataFrame/Series to a NumPy array. - - Parameters - ---------- - modin_obj : modin.DataFrame, modin.Series - The Modin DataFrame/Series to convert. - - Returns - ------- - numpy.array - Converted object with type depending on input. - """ - ... - - -def to_numpy(modin_obj: Union[SupportsPrivateToNumPy, SupportsPublicToNumPy]) -> Any: +def to_numpy( + modin_obj: Union[SupportsPrivateToNumPy, SupportsPublicToNumPy] +) -> np.ndarray: """ Convert a Modin object to a NumPy array. From 1bf6c007312bf10d35e5c2fcf662b9f5d3ac504e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 8 Feb 2023 12:34:15 -0800 Subject: [PATCH 42/42] Fix lint Signed-off-by: Rehan Durrani --- modin/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/utils.py b/modin/utils.py index b8e0556131b..de3f859db8a 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -15,7 +15,7 @@ import importlib import types -from typing import Any, Callable, List, Mapping, Optional, Union, TypeVar, overload +from typing import Any, Callable, List, Mapping, Optional, Union, TypeVar import re import sys import json @@ -469,7 +469,7 @@ def to_numpy( Parameters ---------- modin_obj : modin.DataFrame, modin.Series, modin.numpy.array - The modin distributed object to convert. + The Modin distributed object to convert. Returns -------