Skip to content

Commit

Permalink
Add helper function dh_null_to_nan for explicit null conv of array el…
Browse files Browse the repository at this point in the history
…ements by users (deephaven#5310)

* Add dh_nulls_to_nan for explicit conv by users

* Add input check for public API func

* Add tests

* Respond to review comments and fix a bug

* Make the default for type_promotion False
  • Loading branch information
jmao-denver authored Apr 15, 2024
1 parent c2017ce commit f1316fa
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ public void verifyArguments(Class<?>[] argTypes) {
StringBuilder argTypesStr = new StringBuilder();
for (int i = 0; i < argTypes.length; i++) {
Class<?> argType = argTypes[i];
argType = argType == boolean.class ? Boolean.class : argType;

// if there are more arguments than parameters, we'll need to consider the last parameter as a varargs
// parameter. This is not ideal. We should look for a better way to handle this, i.e. a way to convey that
Expand Down
71 changes: 44 additions & 27 deletions py/server/deephaven/jcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
""" This module provides Java compatibility support including convenience functions to create some widely used Java
data structures from corresponding Python ones in order to be able to call Java methods. """

from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal, Optional
from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Optional

import jpy
import numpy as np
import pandas as pd

from deephaven import dtypes, DHError
from deephaven._wrapper import unwrap, wrap_j_object, JObjectWrapper
from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP
from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP

_NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE
_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")
Expand Down Expand Up @@ -216,14 +216,8 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
dtype (DType): The dtype of the Java array
j_array (jpy.JType): The Java array to convert
conv_null (bool): If True, convert nulls to the null value for the dtype
type_promotion (bool): Ignored when conv_null is False. When type_promotion is False, (1) input Java integer,
boolean, or character arrays containing Deephaven nulls yield an exception, (2) input Java float or double
arrays containing Deephaven nulls have null values converted to np.nan, and (3) input Java arrays without
Deephaven nulls are converted to the target type. When type_promotion is True, (1) input Java integer,
boolean, or character arrays containing Deephaven nulls are converted to np.float64 arrays and Deephaven
null values are converted to np.nan, (2) input Java float or double arrays containing Deephaven nulls have
null values converted to np.nan, and (3) input Java arrays without Deephaven nulls are converted to the
target type. Defaults to False.
type_promotion (bool): Ignored when conv_null is False. When conv_null is True, see the description for the same
named parameter in dh_null_to_nan().
Returns:
np.ndarray: The numpy array or None if the Java array is None
Expand Down Expand Up @@ -255,26 +249,49 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
np_array = np.array(j_array, np.object_)

if conv_null:
if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
if dtype in (dtypes.float32, dtypes.float64):
np_array = np.copy(np_array)
np_array[np_array == dh_null] = np.nan
else:
if dtype is dtypes.bool_: # needs to change its type to byte for dh null detection
np_array = np.frombuffer(np_array, np.byte)

if any(np_array[np_array == dh_null]):
if not type_promotion:
raise DHError(f"Problem creating numpy array. Java {dtype} array contains Deephaven null values, but numpy {np_array.dtype} array does not support null values")
np_array = np_array.astype(np.float64)
np_array[np_array == dh_null] = np.nan
else:
if dtype is dtypes.bool_: # needs to change its type back to bool
np_array = np.frombuffer(np_array, np.bool_)
return np_array
return dh_null_to_nan(np_array, type_promotion)

return np_array

def dh_null_to_nan(np_array: np.ndarray, type_promotion: bool = False) -> np.ndarray:
    """Converts Deephaven primitive null values in the given numpy array to np.nan. No conversion is performed on
    non-primitive types.

    Note, the input numpy array is never modified. For arrays of a float or double type, the conversion is applied to
    a copy of the input and that copy is returned. For arrays of other primitive types, a new np.float64 array is
    returned when type_promotion is True. Arrays of non-primitive types are returned as is.

    Args:
        np_array (np.ndarray): The numpy array to convert
        type_promotion (bool): When False, integer, boolean, or character arrays will cause an exception to be raised.
            When True, integer, boolean, or character arrays are converted to new np.float64 arrays and Deephaven null
            values in them are converted to np.nan. Numpy arrays of float or double types are not affected by this flag
            and Deephaven nulls in them are always converted to np.nan. Defaults to False.

    Returns:
        np.ndarray: The numpy array with Deephaven nulls converted to np.nan.

    Raises:
        DHError: If np_array is not a numpy array, or if it is of an integer, boolean, or character type and
            type_promotion is False.
    """
    if not isinstance(np_array, np.ndarray):
        raise DHError(message="The given np_array argument is not a numpy array.")

    dtype = dtypes.from_np_dtype(np_array.dtype)
    dh_null = _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)
    if dh_null is None:
        # No null sentinel is registered for this dtype, so there is nothing to convert.
        return np_array

    if dtype in (dtypes.float32, dtypes.float64):
        # np.nan is representable in the array's own dtype; work on a copy so the caller's array stays untouched.
        converted = np.copy(np_array)
    else:
        if not type_promotion:
            raise DHError(message=f"failed to convert DH nulls to np.nan in the numpy array. The array is "
                                  f"of {np_array.dtype.type} type but type_promotion is False")
        # A boolean array must be reinterpreted as bytes for the null sentinel to be detectable. view() keeps the
        # array's shape and does not require a contiguous buffer.
        source = np_array.view(np.byte) if dtype is dtypes.bool_ else np_array
        # astype() always allocates a new array, so the input is not modified by the assignment below.
        converted = source.astype(np.float64)

    converted[converted == dh_null] = np.nan
    return converted

def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series:
"""Produce a copy of the specified Java array as a pandas.Series object.
Expand Down
72 changes: 52 additions & 20 deletions py/server/tests/test_udf_array_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np

from deephaven import empty_table, DHError, dtypes
from deephaven.jcompat import dh_null_to_nan
from tests.testbase import BaseTestCase
from .test_udf_scalar_args import _J_TYPE_NP_DTYPE_MAP, _J_TYPE_NULL_MAP, _J_TYPE_J_ARRAY_TYPE_MAP

Expand Down Expand Up @@ -100,21 +101,8 @@ def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
"""
exec(func_str, globals())

# for floating point types, DH nulls are auto converted to np.nan
# for integer types, DH nulls in the array raise exceptions
if j_dtype in ("float", "double"):
res = tbl.update("Z = test_udf(X, Y)")
self.assertEqual(10, res.to_string().count("true"))
else:
res = tbl.update("Z = test_udf(X, Y)")
self.assertEqual(10, res.to_string().count("true"))

# TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
# with self.assertRaises(DHError) as cm:
# tbl.update("Z = test_udf(X, Y)")
# self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values,
# but numpy .* "
# "array does not support ")
res = tbl.update("Z = test_udf(X, Y)")
self.assertEqual(10, res.to_string().count("true"))

def test_np_object_array(self):
with self.subTest("PyObject"):
Expand Down Expand Up @@ -189,11 +177,6 @@ def test_udf(p1: np.ndarray[np.bool_], p2=None) -> bool:
t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]).group_by("X")
t1 = t.update(["X1 = test_udf(Y)"])
self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
# TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
# with self.assertRaises(DHError) as cm:
# t1 = t.update(["X1 = test_udf(Y)"])
# self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* "
# "array does not support ")
t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]).group_by("X")
t1 = t.update(["X1 = test_udf(Y)"])
self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
Expand Down Expand Up @@ -237,6 +220,55 @@ def test_udf(x, y: Union[{th}, np.ndarray[np.int64]]) -> bool:
["Z = test_udf(X, Y.toArray())"])
self.assertEqual(t.columns[2].data_type, dtypes.bool_)

def test_dh_null_conversion(self):
    """Exercises dh_null_to_nan() from inside UDFs, for every Java type in _J_TYPE_NULL_MAP and for booleans.

    With type_promotion=True the UDF checks that the input array is left without NaNs while the returned array
    contains NaNs and has the expected dtype. With type_promotion=False a DHError is expected for every type other
    than float and double.
    """
    # Local import: dedent keeps the exec'd UDF source valid regardless of how deeply this method is indented.
    import textwrap

    x_formula = "X = i % 10"
    for j_dtype, null_name in _J_TYPE_NULL_MAP.items():
        y_formula = f"Y = i % 3 == 0? {null_name} : ({j_dtype})i"
        with self.subTest(j_dtype):
            np_dtype = _J_TYPE_NP_DTYPE_MAP[j_dtype]
            tbl = empty_table(100).update([x_formula, y_formula]).group_by("X")

            # check_y: nanmean(y) == mean(y) only holds when y itself contains no NaN after the call.
            # check_z: the returned array has NaNs, and is float64 unless y was already float32/float64.
            func_str = textwrap.dedent(f"""
                def test_udf(x, y: np.ndarray[{np_dtype}]) -> bool:
                    z = dh_null_to_nan(y, type_promotion=True)
                    check_y = (isinstance(x, int) and isinstance(y, np.ndarray) and y.dtype.type == {np_dtype}
                               and np.nanmean(y) == np.mean(y))
                    check_z = np.any(np.isnan(z)) and (z.dtype.type == np.float64
                                                       if y.dtype.type not in {{np.float32, np.float64}}
                                                       else z.dtype == y.dtype)
                    return check_y and check_z
                """)
            exec(func_str, globals())

            res = tbl.update("Z = test_udf(X, Y)")
            self.assertEqual(10, res.to_string().count("true"))

            func_str = textwrap.dedent(f"""
                def test_udf(x, y: np.ndarray[{np_dtype}]) -> bool:
                    z = dh_null_to_nan(y, type_promotion=False)
                    return True
                """)
            exec(func_str, globals())
            if j_dtype not in {"float", "double"}:
                with self.assertRaises(DHError) as cm:
                    tbl.update("Z = test_udf(X, Y)")
                self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")
            else:
                res = tbl.update("Z = test_udf(X, Y)")
                self.assertEqual(10, res.to_string().count("true"))

    with self.subTest("boolean"):
        def test_udf(p1: np.ndarray[np.bool_], p2=None, tp: bool = True) -> bool:
            z = dh_null_to_nan(p1, type_promotion=tp)
            return z.dtype.type == np.float64 and np.any(np.isnan(z))

        t = empty_table(100).update(["X = i % 10", "Y = i % 3 == 0? true : null"]).group_by("X")
        # Assert on the result of THIS update (not a leftover table from the loop above). Only the X1 column is
        # rendered so that boolean values held in the grouped Y column cannot contribute to the count.
        res = t.update(["X1 = test_udf(Y)"])
        self.assertEqual(10, res.view("X1").to_string().count("true"))

        with self.assertRaises(DHError) as cm:
            t.update(["X1 = test_udf(Y, null, false)"])
        self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")


if __name__ == "__main__":
unittest.main()
4 changes: 2 additions & 2 deletions py/server/tests/test_udf_scalar_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ def test_udf(p1: int, p2: float, kw1: str) -> bool:

with self.assertRaises(DHError) as cm:
t = empty_table(1).update("X = `1`").update("Y = test_udf(1, 1.0, X = `1`)")
self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got boolean")
self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got class java.lang.Boolean")

with self.subTest("with keyword only params"):
def test_udf(p1: int, p2: float, *, kw1: str) -> bool:
Expand Down Expand Up @@ -538,7 +538,7 @@ def f6(*args: np.int32, col2: np.ndarray[np.int32]) -> bool:

with self.assertRaises(DHError) as cm:
t1 = t.update(["X1 = f6(X, Y=null)"])
self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got boolean")
self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got class java.lang.Boolean")

with self.subTest("f7"):
def f1(x: int) -> Optional[float]:
Expand Down

0 comments on commit f1316fa

Please sign in to comment.