2010YOUY01 commented on code in PR #7978: URL: https://github.com/apache/arrow-datafusion/pull/7978#discussion_r1375715372
########## datafusion/expr/src/udf.rs: ########## @@ -17,12 +17,67 @@ //! Udf module contains foundational types that are used to represent UDFs in DataFusion. -use crate::{Expr, ReturnTypeFunction, ScalarFunctionImplementation, Signature}; +use crate::{ + ColumnarValue, Expr, FuncMonotonicity, ReturnTypeFunction, + ScalarFunctionImplementation, Signature, TypeSignature, Volatility, +}; +use arrow::array::ArrayRef; +use arrow::datatypes::DataType; +use datafusion_common::{internal_err, DataFusionError, Result}; +use std::any::Any; use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; +// TODO(PR): add doc comments +pub trait ScalarFunctionDef: Any + Sync + Send + std::fmt::Debug { + /// Return as [`Any`] so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + // May return 1 or more name as aliasing + fn name(&self) -> &[&str]; + + fn input_type(&self) -> TypeSignature; + + fn return_type(&self) -> FunctionReturnType; + + fn execute(&self, _args: &[ArrayRef]) -> Result<ArrayRef> { + internal_err!("This method should be implemented if `supports_execute_raw()` returns `false`") + } + + fn volatility(&self) -> Volatility; + + fn monotonicity(&self) -> Option<FuncMonotonicity>; + + // =============================== + // OPTIONAL METHODS START BELOW + // =============================== + + /// `execute()` and `execute_raw()` are two possible alternative for function definition: + /// If returns `false`, `execute()` will be used for execution; + /// If returns `true`, `execute_raw()` will be called. + fn use_execute_raw_instead(&self) -> bool { + false + } + + /// An alternative function defination than `execute()` + fn execute_raw(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> { + internal_err!("This method should be implemented if `supports_execute_raw()` returns `true`") + } +} + +/// Defines the return type behavior of a function. +pub enum FunctionReturnType { Review Comment: Now like 99% of built-in functions are either `SameAsFirstArg` or `FixedType`, only very rare array functions can only be defined using lambda. This way can make the new interface a little bit easier to use. (also possible to extend to address https://github.com/apache/arrow-datafusion/discussions/7657) ########## datafusion/physical-expr/src/functions.rs: ########## @@ -45,142 +45,233 @@ use arrow::{ datatypes::{DataType, Int32Type, Int64Type, Schema}, }; use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; -pub use datafusion_expr::FuncMonotonicity; use datafusion_expr::{ - BuiltinScalarFunction, ColumnarValue, ScalarFunctionImplementation, + BuiltinScalarFunction, ColumnarValue, FunctionReturnType, + ScalarFunctionImplementation, }; +pub use datafusion_expr::{ + FuncMonotonicity, ReturnTypeFunction, ScalarFunctionDef, TypeSignature, Volatility, +}; +use std::any::Any; use std::ops::Neg; use std::sync::Arc; -/// Create a physical (function) expression. -/// This function errors when `args`' can't be coerced to a valid argument type of the function. -pub fn create_physical_expr( - fun: &BuiltinScalarFunction, - input_phy_exprs: &[Arc<dyn PhysicalExpr>], - input_schema: &Schema, - execution_props: &ExecutionProps, -) -> Result<Arc<dyn PhysicalExpr>> { - let input_expr_types = input_phy_exprs - .iter() - .map(|e| e.data_type(input_schema)) - .collect::<Result<Vec<_>>>()?; +/// `ScalarFunctionDef` is the new interface for builtin scalar functions +/// This is an adapter between the old and new interface, to use the new interface +/// for internal execution. Functions are planned to move into new interface gradually +#[derive(Debug, Clone)] +pub(crate) struct BuiltinScalarFunctionWrapper { + func: Arc<dyn ScalarFunctionDef>, + // functions like `now()` requires per-execution properties + execution_props: ExecutionProps, Review Comment: Few functions like `now()` require the core to pass some information to it, when migrating those functions, we can extend the `trait ScalarFunctionDef` with a optional method `set_execution_props(exec_props: ExecutionProps)`, as the mechanism to let core pass data to functions defined outside the core ########## datafusion/expr/src/udf.rs: ########## @@ -17,12 +17,67 @@ //! Udf module contains foundational types that are used to represent UDFs in DataFusion. -use crate::{Expr, ReturnTypeFunction, ScalarFunctionImplementation, Signature}; +use crate::{ + ColumnarValue, Expr, FuncMonotonicity, ReturnTypeFunction, + ScalarFunctionImplementation, Signature, TypeSignature, Volatility, +}; +use arrow::array::ArrayRef; +use arrow::datatypes::DataType; +use datafusion_common::{internal_err, DataFusionError, Result}; +use std::any::Any; use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; +// TODO(PR): add doc comments +pub trait ScalarFunctionDef: Any + Sync + Send + std::fmt::Debug { + /// Return as [`Any`] so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + // May return 1 or more name as aliasing + fn name(&self) -> &[&str]; + + fn input_type(&self) -> TypeSignature; + + fn return_type(&self) -> FunctionReturnType; + + fn execute(&self, _args: &[ArrayRef]) -> Result<ArrayRef> { + internal_err!("This method should be implemented if `supports_execute_raw()` returns `false`") + } + + fn volatility(&self) -> Volatility; + + fn monotonicity(&self) -> Option<FuncMonotonicity>; + + // =============================== + // OPTIONAL METHODS START BELOW Review Comment: This trait consists of mandatory and optional methods, it can get a bit lengthy... An alternative implementation is trait inheritance(e.g. `pub trait ScalarFunctionExtended: ScalarFunctionDef`, it seems won't be much clearer than the current one. We can add more docs/examples to make it more straightforward later ########## datafusion/expr/src/built_in_function.rs: ########## @@ -1550,6 +1551,46 @@ impl FromStr for BuiltinScalarFunction { } } +/// `ScalarFunctionDef` is the new interface for builtin scalar functions +/// This is an adapter between the old and new interface, to use the new interface +/// for internal execution. Functions are planned to move into new interface gradually +/// The function body (`execute()` in `ScalarFunctionDef`) now are all defined in +/// `physical-expr` crate, so the new interface implementation are defined separately +/// in `BuiltinScalarFunctionWrapper` +impl ScalarFunctionDef for BuiltinScalarFunction { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &[&str] { + aliases(self) + } + + fn input_type(&self) -> TypeSignature { + self.signature().type_signature + } + + fn return_type(&self) -> FunctionReturnType { + let self_cloned = *self; + let return_type_resolver = move |args: &[DataType]| -> Result<Arc<DataType>> { + let result = BuiltinScalarFunction::return_type(self_cloned, args)?; + Ok(Arc::new(result)) + }; + + FunctionReturnType::LambdaReturnType(Arc::new(return_type_resolver)) + } + + fn volatility(&self) -> Volatility { + self.volatility() + } + + fn monotonicity(&self) -> Option<FuncMonotonicity> { + self.monotonicity() + } + + // execution functions are defined in `BuiltinScalarFunctionWrapper` Review Comment: All execution code for `BuiltinScalarFunction` are in `phys-expr` crate (which depends on this crate), so they're defined elsewhere ########## datafusion/expr/src/udf.rs: ########## @@ -17,12 +17,67 @@ //! Udf module contains foundational types that are used to represent UDFs in DataFusion. -use crate::{Expr, ReturnTypeFunction, ScalarFunctionImplementation, Signature}; +use crate::{ + ColumnarValue, Expr, FuncMonotonicity, ReturnTypeFunction, + ScalarFunctionImplementation, Signature, TypeSignature, Volatility, +}; +use arrow::array::ArrayRef; +use arrow::datatypes::DataType; +use datafusion_common::{internal_err, DataFusionError, Result}; +use std::any::Any; use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; +// TODO(PR): add doc comments +pub trait ScalarFunctionDef: Any + Sync + Send + std::fmt::Debug { + /// Return as [`Any`] so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + // May return 1 or more name as aliasing + fn name(&self) -> &[&str]; + + fn input_type(&self) -> TypeSignature; + + fn return_type(&self) -> FunctionReturnType; + + fn execute(&self, _args: &[ArrayRef]) -> Result<ArrayRef> { + internal_err!("This method should be implemented if `supports_execute_raw()` returns `false`") + } + + fn volatility(&self) -> Volatility; + + fn monotonicity(&self) -> Option<FuncMonotonicity>; + + // =============================== + // OPTIONAL METHODS START BELOW + // =============================== + + /// `execute()` and `execute_raw()` are two possible alternative for function definition: + /// If returns `false`, `execute()` will be used for execution; + /// If returns `true`, `execute_raw()` will be called. + fn use_execute_raw_instead(&self) -> bool { Review Comment: Rational for this: built-in functions now have two kinds of implementation 1. `execute()` -- It's a more common and easy-to-implement interface for UDFs, and can be converted to the more general `execute_raw()` case using `make_scalar_function()` 2. `execute_raw()` -- Fewer existing functions are directly implemented using this interface Though a single `execute_raw()` can cover all existing cases, this design can make the general case easier to implement for UDFs -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org