-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
A new interface for Scalar Functions #7978
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
|
||
//! Built-in functions module contains all the built-in functions definitions. | ||
|
||
use std::any::Any; | ||
use std::cmp::Ordering; | ||
use std::collections::HashMap; | ||
use std::fmt; | ||
|
@@ -28,8 +29,8 @@ use crate::signature::TIMEZONE_WILDCARD; | |
use crate::type_coercion::binary::get_wider_type; | ||
use crate::type_coercion::functions::data_types; | ||
use crate::{ | ||
conditional_expressions, struct_expressions, utils, FuncMonotonicity, Signature, | ||
TypeSignature, Volatility, | ||
conditional_expressions, struct_expressions, utils, FuncMonotonicity, | ||
FunctionReturnType, ScalarFunctionDef, Signature, TypeSignature, Volatility, | ||
}; | ||
|
||
use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; | ||
|
@@ -1550,6 +1551,46 @@ impl FromStr for BuiltinScalarFunction { | |
} | ||
} | ||
|
||
/// `ScalarFunctionDef` is the new interface for builtin scalar functions. | ||
/// This is an adapter between the old and new interfaces, so that the new interface | ||
/// can be used for internal execution. Functions are planned to move to the new | ||
/// interface gradually. | ||
/// The function bodies (`execute()` in `ScalarFunctionDef`) are currently all defined in | ||
/// the `physical-expr` crate, so the new interface's execution methods are implemented | ||
/// separately in `BuiltinScalarFunctionWrapper`. | ||
impl ScalarFunctionDef for BuiltinScalarFunction { | ||
fn as_any(&self) -> &dyn Any { | ||
self | ||
} | ||
|
||
fn name(&self) -> &[&str] { | ||
aliases(self) | ||
} | ||
|
||
fn input_type(&self) -> TypeSignature { | ||
self.signature().type_signature | ||
} | ||
|
||
fn return_type(&self) -> FunctionReturnType { | ||
let self_cloned = *self; | ||
let return_type_resolver = move |args: &[DataType]| -> Result<Arc<DataType>> { | ||
let result = BuiltinScalarFunction::return_type(self_cloned, args)?; | ||
Ok(Arc::new(result)) | ||
}; | ||
|
||
FunctionReturnType::LambdaReturnType(Arc::new(return_type_resolver)) | ||
} | ||
|
||
fn volatility(&self) -> Volatility { | ||
self.volatility() | ||
} | ||
|
||
fn monotonicity(&self) -> Option<FuncMonotonicity> { | ||
self.monotonicity() | ||
} | ||
|
||
// execution functions are defined in `BuiltinScalarFunctionWrapper` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All execution code for |
||
} | ||
|
||
/// Creates a function that returns the return type of a string function given | ||
/// the type of its first argument. | ||
/// | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ use crate::utils::{expr_to_columns, find_out_reference_exprs}; | |
use crate::window_frame; | ||
use crate::window_function; | ||
use crate::Operator; | ||
use crate::ScalarFunctionDef; | ||
use crate::{aggregate_function, ExprSchemable}; | ||
use arrow::datatypes::DataType; | ||
use datafusion_common::tree_node::{Transformed, TreeNode}; | ||
|
@@ -150,6 +151,9 @@ pub enum Expr { | |
Sort(Sort), | ||
/// Represents the call of a built-in scalar function with a set of arguments. | ||
ScalarFunction(ScalarFunction), | ||
/// Represents the call of a built-in scalar function with a set of arguments, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you think about calling this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than add a parallel implementation I would love to just change
to something like
|
||
/// with new `ScalarFunctionDef` interface | ||
ScalarFunctionExpr(ScalarFunctionExpr), | ||
/// Represents the call of a user-defined scalar function with arguments. | ||
ScalarUDF(ScalarUDF), | ||
/// Represents the call of an aggregate built-in function with arguments. | ||
|
@@ -351,6 +355,38 @@ impl ScalarFunction { | |
} | ||
} | ||
|
||
/// scalar function expression for new `ScalarFunctionDef` interface | ||
#[derive(Clone, Debug)] | ||
pub struct ScalarFunctionExpr { | ||
/// The function | ||
pub fun: Arc<dyn ScalarFunctionDef>, | ||
/// List of expressions to feed to the functions as arguments | ||
pub args: Vec<Expr>, | ||
} | ||
|
||
impl Hash for ScalarFunctionExpr { | ||
fn hash<H: Hasher>(&self, state: &mut H) { | ||
self.fun.name().hash(state); | ||
self.fun.input_type().hash(state); | ||
} | ||
} | ||
|
||
impl Eq for ScalarFunctionExpr {} | ||
|
||
impl PartialEq for ScalarFunctionExpr { | ||
fn eq(&self, other: &Self) -> bool { | ||
self.fun.name() == other.fun.name() | ||
&& self.fun.input_type() == other.fun.input_type() | ||
} | ||
} | ||
|
||
impl ScalarFunctionExpr { | ||
/// Create a new ScalarFunctionExpr expression | ||
pub fn new(fun: Arc<dyn ScalarFunctionDef>, args: Vec<Expr>) -> Self { | ||
Self { fun, args } | ||
} | ||
} | ||
|
||
/// ScalarUDF expression | ||
#[derive(Clone, PartialEq, Eq, Hash, Debug)] | ||
pub struct ScalarUDF { | ||
|
@@ -731,6 +767,7 @@ impl Expr { | |
Expr::Placeholder(_) => "Placeholder", | ||
Expr::QualifiedWildcard { .. } => "QualifiedWildcard", | ||
Expr::ScalarFunction(..) => "ScalarFunction", | ||
Expr::ScalarFunctionExpr(..) => "ScalarFunctionExpr", | ||
Expr::ScalarSubquery { .. } => "ScalarSubquery", | ||
Expr::ScalarUDF(..) => "ScalarUDF", | ||
Expr::ScalarVariable(..) => "ScalarVariable", | ||
|
@@ -1177,6 +1214,9 @@ impl fmt::Display for Expr { | |
Expr::ScalarFunction(func) => { | ||
fmt_function(f, &func.fun.to_string(), false, &func.args, true) | ||
} | ||
Expr::ScalarFunctionExpr(func) => { | ||
fmt_function(f, func.fun.name()[0], false, &func.args, true) | ||
} | ||
Expr::ScalarUDF(ScalarUDF { fun, args }) => { | ||
fmt_function(f, &fun.name, false, args, true) | ||
} | ||
|
@@ -1511,6 +1551,9 @@ fn create_name(e: &Expr) -> Result<String> { | |
Expr::ScalarFunction(func) => { | ||
create_function_name(&func.fun.to_string(), false, &func.args) | ||
} | ||
Expr::ScalarFunctionExpr(func) => { | ||
create_function_name(func.fun.name()[0], false, &func.args) | ||
} | ||
Expr::ScalarUDF(ScalarUDF { fun, args }) => { | ||
create_function_name(&fun.name, false, args) | ||
} | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -17,12 +17,67 @@ | |||||
|
||||||
//! Udf module contains foundational types that are used to represent UDFs in DataFusion. | ||||||
|
||||||
use crate::{Expr, ReturnTypeFunction, ScalarFunctionImplementation, Signature}; | ||||||
use crate::{ | ||||||
ColumnarValue, Expr, FuncMonotonicity, ReturnTypeFunction, | ||||||
ScalarFunctionImplementation, Signature, TypeSignature, Volatility, | ||||||
}; | ||||||
use arrow::array::ArrayRef; | ||||||
use arrow::datatypes::DataType; | ||||||
use datafusion_common::{internal_err, DataFusionError, Result}; | ||||||
use std::any::Any; | ||||||
use std::fmt; | ||||||
use std::fmt::Debug; | ||||||
use std::fmt::Formatter; | ||||||
use std::sync::Arc; | ||||||
|
||||||
// TODO(PR): add doc comments | ||||||
pub trait ScalarFunctionDef: Any + Sync + Send + std::fmt::Debug { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you think about calling this trait |
||||||
/// Return as [`Any`] so that it can be | ||||||
/// downcast to a specific implementation. | ||||||
fn as_any(&self) -> &dyn Any; | ||||||
|
||||||
// May return one or more names; multiple names act as aliases | ||||||
fn name(&self) -> &[&str]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This approach looks better 👍🏼 |
||||||
|
||||||
fn input_type(&self) -> TypeSignature; | ||||||
|
||||||
fn return_type(&self) -> FunctionReturnType; | ||||||
|
||||||
fn execute(&self, _args: &[ArrayRef]) -> Result<ArrayRef> { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should be
Suggested change
|
||||||
internal_err!("This method should be implemented if `supports_execute_raw()` returns `false`") | ||||||
} | ||||||
|
||||||
fn volatility(&self) -> Volatility; | ||||||
|
||||||
fn monotonicity(&self) -> Option<FuncMonotonicity>; | ||||||
|
||||||
// =============================== | ||||||
// OPTIONAL METHODS START BELOW | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This trait consists of mandatory and optional methods, it can get a bit lengthy... |
||||||
// =============================== | ||||||
|
||||||
/// `execute()` and `execute_raw()` are two alternative ways to define the function: | ||||||
/// if this returns `false`, `execute()` will be used for execution; | ||||||
/// if this returns `true`, `execute_raw()` will be called. | ||||||
fn use_execute_raw_instead(&self) -> bool { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rational for this:
Though a single There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should have a single There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree we should make the initial implementation concise |
||||||
false | ||||||
} | ||||||
|
||||||
/// An alternative function definition to `execute()` | ||||||
fn execute_raw(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> { | ||||||
internal_err!("This method should be implemented if `supports_execute_raw()` returns `true`") | ||||||
} | ||||||
} | ||||||
|
||||||
/// Defines the return type behavior of a function. | ||||||
pub enum FunctionReturnType { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now like 99% of built-in functions are either There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you think about a signature like this: pub trait ScalarFunctionDef: Any + Sync + Send + std::fmt::Debug {
...
/// What type will this function return, given arguments of the specified input types?
/// By default, returns the same type as the first argument
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
arg_types.get(0)
.ok_or_else(Error ("Implementation of Function {} did not specify a return type, and there are no arguments"))
}
...
} Then I think most function implementations can be left as the default or as impl ScalarFunctionDef for Foo {
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
Ok(DataType::Utf8)
}
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The returning |
||||||
/// Matches the first argument's type. | ||||||
SameAsFirstArg, | ||||||
/// A predetermined type. | ||||||
FixedType(Arc<DataType>), | ||||||
/// Decided by a custom lambda function. | ||||||
LambdaReturnType(ReturnTypeFunction), | ||||||
} | ||||||
|
||||||
/// Logical representation of a UDF. | ||||||
#[derive(Clone)] | ||||||
pub struct ScalarUDF { | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we'll be able to `impl` this as long as `BuiltInScalarFunction` is split across two crates.