diff --git a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java index 4f72d7e25cd..7cb26d39a2c 100644 --- a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java +++ b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java @@ -286,6 +286,7 @@ public void verifyArguments(Class[] argTypes) { StringBuilder argTypesStr = new StringBuilder(); for (int i = 0; i < argTypes.length; i++) { Class argType = argTypes[i]; + argType = argType == boolean.class ? Boolean.class : argType; // if there are more arguments than parameters, we'll need to consider the last parameter as a varargs // parameter. This is not ideal. We should look for a better way to handle this, i.e. a way to convey that diff --git a/py/server/deephaven/jcompat.py b/py/server/deephaven/jcompat.py index 75820a87f14..e6e921fd8f1 100644 --- a/py/server/deephaven/jcompat.py +++ b/py/server/deephaven/jcompat.py @@ -5,7 +5,7 @@ """ This module provides Java compatibility support including convenience functions to create some widely used Java data structures from corresponding Python ones in order to be able to call Java methods. 
""" -from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal, Optional +from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Optional import jpy import numpy as np @@ -13,7 +13,7 @@ from deephaven import dtypes, DHError from deephaven._wrapper import unwrap, wrap_j_object, JObjectWrapper -from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP +from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP _NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE _JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility") @@ -216,14 +216,8 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t dtype (DType): The dtype of the Java array j_array (jpy.JType): The Java array to convert conv_null (bool): If True, convert nulls to the null value for the dtype - type_promotion (bool): Ignored when conv_null is False. When type_promotion is False, (1) input Java integer, - boolean, or character arrays containing Deephaven nulls yield an exception, (2) input Java float or double - arrays containing Deephaven nulls have null values converted to np.nan, and (3) input Java arrays without - Deephaven nulls are converted to the target type. When type_promotion is True, (1) input Java integer, - boolean, or character arrays containing Deephaven nulls are converted to np.float64 arrays and Deephaven - null values are converted to np.nan, (2) input Java float or double arrays containing Deephaven nulls have - null values converted to np.nan, and (3) input Java arrays without Deephaven nulls are converted to the - target type. Defaults to False. + type_promotion (bool): Ignored when conv_null is False. When conv_null is True, see the description for the same + named parameter in dh_null_to_nan(). 
Returns: np.ndarray: The numpy array or None if the Java array is None @@ -255,26 +249,49 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t np_array = np.array(j_array, np.object_) if conv_null: - if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): - if dtype in (dtypes.float32, dtypes.float64): - np_array = np.copy(np_array) - np_array[np_array == dh_null] = np.nan - else: - if dtype is dtypes.bool_: # needs to change its type to byte for dh null detection - np_array = np.frombuffer(np_array, np.byte) - - if any(np_array[np_array == dh_null]): - if not type_promotion: - raise DHError(f"Problem creating numpy array. Java {dtype} array contains Deephaven null values, but numpy {np_array.dtype} array does not support null values") - np_array = np_array.astype(np.float64) - np_array[np_array == dh_null] = np.nan - else: - if dtype is dtypes.bool_: # needs to change its type back to bool - np_array = np.frombuffer(np_array, np.bool_) - return np_array + return dh_null_to_nan(np_array, type_promotion) return np_array +def dh_null_to_nan(np_array: np.ndarray, type_promotion: bool = False) -> np.ndarray: + """Converts Deephaven primitive null values in the given numpy array to np.nan. No conversion is performed on + non-primitive types. + + Note, the input numpy array is never modified. For input arrays of a float or double type, a copy of the array is made + and Deephaven nulls in the copy are converted to np.nan. For input arrays of other primitive types, a new array is always returned. + + Args: + np_array (np.ndarray): The numpy array to convert + type_promotion (bool): When False, integer, boolean, or character arrays will cause an exception to be raised. + When True, integer, boolean, or character arrays are converted to new np.float64 arrays and Deephaven null + values in them are converted to np.nan. Numpy arrays of float or double types are not affected by this flag + and Deephaven nulls in them will always be converted to np.nan in a copy of the array. Defaults to False. 
+ + Returns: + np.ndarray: The numpy array with Deephaven nulls converted to np.nan. + + Raises: + DHError + """ + if not isinstance(np_array, np.ndarray): + raise DHError(message="The given np_array argument is not a numpy array.") + + dtype = dtypes.from_np_dtype(np_array.dtype) + if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): + if dtype in (dtypes.float32, dtypes.float64): + np_array = np.copy(np_array) + np_array[np_array == dh_null] = np.nan + else: + if not type_promotion: + raise DHError(message=f"failed to convert DH nulls to np.nan in the numpy array. The array is " + f"of {np_array.dtype.type} type but type_promotion is False") + if dtype is dtypes.bool_: # needs to change its type to byte for dh null detection + np_array = np.frombuffer(np_array, np.byte) + + np_array = np_array.astype(np.float64) + np_array[np_array == dh_null] = np.nan + + return np_array def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series: """Produce a copy of the specified Java array as a pandas.Series object. 
diff --git a/py/server/tests/test_udf_array_args.py b/py/server/tests/test_udf_array_args.py index bf75e92cb9e..0f03b802703 100644 --- a/py/server/tests/test_udf_array_args.py +++ b/py/server/tests/test_udf_array_args.py @@ -8,6 +8,7 @@ import numpy as np from deephaven import empty_table, DHError, dtypes +from deephaven.jcompat import dh_null_to_nan from tests.testbase import BaseTestCase from .test_udf_scalar_args import _J_TYPE_NP_DTYPE_MAP, _J_TYPE_NULL_MAP, _J_TYPE_J_ARRAY_TYPE_MAP @@ -100,21 +101,8 @@ def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool: """ exec(func_str, globals()) - # for floating point types, DH nulls are auto converted to np.nan - # for integer types, DH nulls in the array raise exceptions - if j_dtype in ("float", "double"): - res = tbl.update("Z = test_udf(X, Y)") - self.assertEqual(10, res.to_string().count("true")) - else: - res = tbl.update("Z = test_udf(X, Y)") - self.assertEqual(10, res.to_string().count("true")) - - # TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved - # with self.assertRaises(DHError) as cm: - # tbl.update("Z = test_udf(X, Y)") - # self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, - # but numpy .* " - # "array does not support ") + res = tbl.update("Z = test_udf(X, Y)") + self.assertEqual(10, res.to_string().count("true")) def test_np_object_array(self): with self.subTest("PyObject"): @@ -189,11 +177,6 @@ def test_udf(p1: np.ndarray[np.bool_], p2=None) -> bool: t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? 
true : null"]).group_by("X") t1 = t.update(["X1 = test_udf(Y)"]) self.assertEqual(t1.columns[2].data_type, dtypes.bool_) - # TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved - # with self.assertRaises(DHError) as cm: - # t1 = t.update(["X1 = test_udf(Y)"]) - # self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* " - # "array does not support ") t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]).group_by("X") t1 = t.update(["X1 = test_udf(Y)"]) self.assertEqual(t1.columns[2].data_type, dtypes.bool_) @@ -237,6 +220,55 @@ def test_udf(x, y: Union[{th}, np.ndarray[np.int64]]) -> bool: ["Z = test_udf(X, Y.toArray())"]) self.assertEqual(t.columns[2].data_type, dtypes.bool_) + def test_dh_null_conversion(self): + x_formula = "X = i % 10" + for j_dtype, null_name in _J_TYPE_NULL_MAP.items(): + y_formula = f"Y = i % 3 == 0? {null_name} : ({j_dtype})i" + with self.subTest(j_dtype): + tbl = empty_table(100).update([x_formula, y_formula]).group_by("X") + + func_str = f""" +def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool: + z = dh_null_to_nan(y, type_promotion=True) + check_y = (isinstance(x, int) and isinstance(y, np.ndarray) and y.dtype.type == +{_J_TYPE_NP_DTYPE_MAP[j_dtype]} and np.nanmean(y) == np.mean( y)) + check_z = np.any(np.isnan(z)) and (z.dtype.type == np.float64 if y.dtype.type not in {{np.float32, np.float64}} + else z.dtype == y.dtype) + return check_y and check_z + """ + exec(func_str, globals()) + + res = tbl.update("Z = test_udf(X, Y)") + self.assertEqual(10, res.to_string().count("true")) + + func_str = f""" +def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool: + z = dh_null_to_nan(y, type_promotion=False) + return True + """ + exec(func_str, globals()) + if j_dtype not in {"float", "double"}: + with self.assertRaises(DHError) as cm: + res = tbl.update("Z = test_udf(X, Y)") + 
self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False") + else: + res = tbl.update("Z = test_udf(X, Y)") + self.assertEqual(10, res.to_string().count("true")) + + + with self.subTest("boolean"): + def test_udf(p1: np.ndarray[np.bool_], p2=None, tp: bool = True) -> bool: + z = dh_null_to_nan(p1, type_promotion=tp) + return z.dtype.type == np.float64 and np.any(np.isnan(z)) + + t = empty_table(100).update(["X = i % 10", "Y = i % 3 == 0? true : null"]).group_by("X") + res = t.update(["X1 = test_udf(Y)"]) + self.assertEqual(10, res.to_string(cols="X1").count("true")) + + with self.assertRaises(DHError) as cm: + t.update(["X1 = test_udf(Y, null, false)"]) + self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False") + if __name__ == "__main__": unittest.main() diff --git a/py/server/tests/test_udf_scalar_args.py b/py/server/tests/test_udf_scalar_args.py index cae44e8757c..0dc6fecb6b1 100644 --- a/py/server/tests/test_udf_scalar_args.py +++ b/py/server/tests/test_udf_scalar_args.py @@ -408,7 +408,7 @@ def test_udf(p1: int, p2: float, kw1: str) -> bool: with self.assertRaises(DHError) as cm: t = empty_table(1).update("X = `1`").update("Y = test_udf(1, 1.0, X = `1`)") - self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got boolean") + self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got class java.lang.Boolean") with self.subTest("with keyword only params"): def test_udf(p1: int, p2: float, *, kw1: str) -> bool: @@ -538,7 +538,7 @@ def f6(*args: np.int32, col2: np.ndarray[np.int32]) -> bool: with self.assertRaises(DHError) as cm: t1 = t.update(["X1 = f6(X, Y=null)"]) - self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got boolean") + self.assertRegex(str(cm.exception), r"f6: Expected argument \(col2\) to be either .* got class java.lang.Boolean") with self.subTest("f7"): def f1(x: int) -> Optional[float]: