apache · timsaucer · Oct 2, 2024 · Oct 2, 2024 · Oct 2, 2024 · Oct 2, 2024
diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py
@@ -29,7 +29,6 @@
     WindowFrame,
     column,
     literal,
-    udf,
 )
 from datafusion.expr import Window
 
@@ -236,21 +235,6 @@ def test_unnest_without_nulls(nested_df):
     assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9])
 
 
-def test_udf(df):
-    # is_null is a pa function over arrays
-    is_null = udf(
-        lambda x: x.is_null(),
-        [pa.int64()],
-        pa.bool_(),
-        volatility="immutable",
-    )
-
-    df = df.select(is_null(column("a")))
-    result = df.collect()[0].column(0)
-
-    assert result == pa.array([False, False, False])
-
-
 def test_join():
     ctx = SessionContext()
 

diff --git a/python/datafusion/tests/test_udaf.py b/python/datafusion/tests/test_udaf.py
@@ -21,14 +21,14 @@
 import pyarrow.compute as pc
 import pytest
 
-from datafusion import Accumulator, column, udaf, udf
+from datafusion import Accumulator, column, udaf
 
 
 class Summarize(Accumulator):
     """Interface of a user-defined accumulation."""
 
-    def __init__(self):
-        self._sum = pa.scalar(0.0)
+    def __init__(self, initial_value: float = 0.0):
+        self._sum = pa.scalar(initial_value)
 
     def state(self) -> List[pa.Scalar]:
         return [self._sum]
@@ -97,7 +97,7 @@ def test_errors(df):
         df.collect()
 
 
-def test_aggregate(df):
+def test_udaf_aggregate(df):
     summarize = udaf(
         Summarize,
         pa.float64(),
@@ -106,13 +106,40 @@ def test_aggregate(df):
         volatility="immutable",
     )
 
-    df = df.aggregate([], [summarize(column("a"))])
+    df1 = df.aggregate([], [summarize(column("a"))])
 
     # execute and collect the first (and only) batch
-    result = df.collect()[0]
+    result = df1.collect()[0]
 
     assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])
 
+    df2 = df.aggregate([], [summarize(column("a"))])
+
+    # Run a second time to ensure the state is properly reset
+    result = df2.collect()[0]
+
+    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])
+
+
+def test_udaf_aggregate_with_arguments(df):  # udaf built with arguments=[bias]
+    bias = 10.0  # the assert below expects this added on top of the column sum
+
+    summarize = udaf(
+        Summarize,
+        pa.float64(),
+        pa.float64(),
+        [pa.float64()],
+        volatility="immutable",
+        arguments=[bias],  # presumably Summarize's initial_value - TODO confirm
+    )
+
+    df1 = df.aggregate([], [summarize(column("a"))])
+
+    # execute and collect the first (and only) batch
+    result = df1.collect()[0]
+
+    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])
+
 
 def test_group_by(df):
     summarize = udaf(
@@ -146,20 +173,3 @@ def test_register_udaf(ctx, df) -> None:
     df_result = ctx.sql("select summarize(b) from test_table")
 
     assert df_result.collect()[0][0][0].as_py() == 14.0
-
-
-def test_register_udf(ctx, df) -> None:
-    is_null = udf(
-        lambda x: x.is_null(),
-        [pa.float64()],
-        pa.bool_(),
-        volatility="immutable",
-        name="is_null",
-    )
-
-    ctx.register_udf(is_null)
-
-    df_result = ctx.sql("select is_null(a) from test_table")
-    result = df_result.collect()[0].column(0)
-
-    assert result == pa.array([False, False, False])
diff --git a/python/datafusion/tests/test_udf.py b/python/datafusion/tests/test_udf.py
@@ -0,0 +1,96 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datafusion import udf, column
+import pyarrow as pa
+import pytest
+
+
+@pytest.fixture
+def df(ctx):  # ctx is not defined in this file (presumably a conftest fixture)
+    # build a two-column RecordBatch ("a", "b") and create a DataFrame from it
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 4, 6])],
+        names=["a", "b"],
+    )
+    return ctx.create_dataframe([[batch]], name="test_table")  # name the SQL test uses
+
+
+def test_udf(df):  # scalar UDF built from a lambda and applied via df.select
+    # is_null is a pyarrow function that operates on whole arrays
+    is_null = udf(
+        lambda x: x.is_null(),
+        [pa.int64()],
+        pa.bool_(),
+        volatility="immutable",
+    )
+
+    df = df.select(is_null(column("a")))
+    result = df.collect()[0].column(0)  # first batch, first column
+
+    assert result == pa.array([False, False, False])  # fixture column "a" has no nulls
+
+
+def test_register_udf(ctx, df) -> None:  # UDF registered on ctx, then called from SQL
+    is_null = udf(
+        lambda x: x.is_null(),
+        [pa.float64()],  # NOTE(review): fixture column "a" holds ints - coerced? confirm
+        pa.bool_(),
+        volatility="immutable",
+        name="is_null",  # the function name the SQL query below calls
+    )
+
+    ctx.register_udf(is_null)
+
+    df_result = ctx.sql("select is_null(a) from test_table")  # table named by df fixture
+    result = df_result.collect()[0].column(0)
+
+    assert result == pa.array([False, False, False])
+
+
+class OverThresholdUDF:  # callable object: True where a value is >= threshold
+    def __init__(self, threshold: int = 0) -> None:
+        self.threshold = threshold  # inclusive bound compared against in __call__
+
+    def __call__(self, values: pa.Array) -> pa.Array:  # element-wise >= comparison
+        return pa.array(v.as_py() >= self.threshold for v in values)
+
+
+def test_udf_with_parameters(df) -> None:  # callable-object UDFs: default vs explicit
+    udf_no_param = udf(
+        OverThresholdUDF(),  # default threshold of 0
+        pa.int64(),  # NOTE(review): bare DataType; other tests pass a list - confirm
+        pa.bool_(),
+        volatility="immutable",
+    )
+
+    df1 = df.select(udf_no_param(column("a")))
+    result = df1.collect()[0].column(0)
+
+    assert result == pa.array([True, True, True])  # 1, 2 and 3 are all >= 0
+
+    udf_with_param = udf(
+        OverThresholdUDF(2),  # threshold of 2
+        pa.int64(),
+        pa.bool_(),
+        volatility="immutable",
+    )
+
+    df2 = df.select(udf_with_param(column("a")))
+    result = df2.collect()[0].column(0)
+
+    assert result == pa.array([False, True, True])  # only the value 1 is below 2
diff --git a/python/datafusion/tests/test_udwf.py b/python/datafusion/tests/test_udwf.py
@@ -24,7 +24,7 @@
 
 
 class ExponentialSmoothDefault(WindowEvaluator):
-    def __init__(self, alpha: float) -> None:
+    def __init__(self, alpha: float = 0.8) -> None:
         self.alpha = alpha
 
     def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
@@ -183,46 +183,58 @@ def df():
 def test_udwf_errors(df):
     with pytest.raises(TypeError):
         udwf(
-            NotSubclassOfWindowEvaluator(),
+            NotSubclassOfWindowEvaluator,  # passed uncalled, not as an instance
             pa.float64(),
             pa.float64(),
             volatility="immutable",
         )
 
 
 smooth_default = udwf(
-    ExponentialSmoothDefault(0.9),
+    ExponentialSmoothDefault,
+    pa.float64(),
+    pa.float64(),
+    volatility="immutable",
+    arguments=[0.9],  # presumably ExponentialSmoothDefault's alpha - TODO confirm
+)
+
+smooth_no_arugments = udwf(  # NOTE(review): name misspells "arguments"
+    ExponentialSmoothDefault,  # no arguments given: relies on the alpha=0.8 default
     pa.float64(),
     pa.float64(),
     volatility="immutable",
 )
 
 smooth_bounded = udwf(
-    ExponentialSmoothBounded(0.9),
+    ExponentialSmoothBounded,
     pa.float64(),
     pa.float64(),
     volatility="immutable",
+    arguments=[0.9],  # presumably the evaluator's constructor argument - confirm
 )
 
 smooth_rank = udwf(
-    ExponentialSmoothRank(0.9),
+    ExponentialSmoothRank,
     pa.utf8(),
     pa.float64(),
     volatility="immutable",
+    arguments=[0.9],  # presumably the evaluator's constructor argument - confirm
 )
 
 smooth_frame = udwf(
-    ExponentialSmoothFrame(0.9),
+    ExponentialSmoothFrame,
     pa.float64(),
     pa.float64(),
     volatility="immutable",
+    arguments=[0.9],  # presumably the evaluator's constructor argument - confirm
 )
 
 smooth_two_col = udwf(
-    SmoothTwoColumn(0.9),
+    SmoothTwoColumn,
     [pa.int64(), pa.int64()],
     pa.float64(),
     volatility="immutable",
+    arguments=[0.9],  # presumably the evaluator's constructor argument - confirm
 )
 
 data_test_udwf_functions = [
@@ -231,6 +243,11 @@ def test_udwf_errors(df):
         smooth_default(column("a")),
         [0, 0.9, 1.89, 2.889, 3.889, 4.889, 5.889],
     ),
+    (
+        "default_udwf_no_arguments",
+        smooth_no_arugments(column("a")),
+        [0, 0.8, 1.76, 2.752, 3.75, 4.75, 5.75],
+    ),
     (
         "default_udwf_partitioned",
         smooth_default(column("a")).partition_by(column("c")).build(),