findepi commented on code in PR #16681: URL: https://github.com/apache/datafusion/pull/16681#discussion_r2206861910
########## datafusion/expr/src/udf.rs: ########## @@ -696,16 +700,89 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Return true if this scalar UDF is equal to the other. /// - /// Allows customizing the equality of scalar UDFs. - /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]: + /// This method allows customizing the equality of scalar UDFs. It must adhere to the rules of equivalence: + /// + /// - Reflexive: `a.equals(a)` must return true. + /// - Symmetric: `a.equals(b)` implies `b.equals(a)`. + /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. + /// + /// # Default Behavior + /// By default, this method compares the type IDs, names, and signatures of the two UDFs. If these match, + /// the method assumes the UDFs are not equal unless their pointers are the same. This conservative approach + /// ensures that different instances of the same function type are not mistakenly considered equal. /// - /// - reflexive: `a.equals(a)`; - /// - symmetric: `a.equals(b)` implies `b.equals(a)`; - /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. + /// # Custom Implementation + /// If a UDF has internal state or additional properties that should be considered for equality, this method + /// should be overridden. For example, a UDF with parameters might compare those parameters in addition to + /// the default checks. + /// + /// # Example + /// ```rust + /// use std::any::Any; + /// use std::hash::{DefaultHasher, Hash, Hasher}; + /// use arrow::datatypes::DataType; + /// use datafusion_common::{not_impl_err, Result}; + /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; + /// + /// #[derive(Debug, PartialEq)] + /// struct MyUdf { + /// param: i32, + /// signature: Signature, + /// } /// - /// By default, compares [`Self::name`] and [`Self::signature`]. + /// impl ScalarUDFImpl for MyUdf { + /// fn as_any(&self) -> &dyn Any { + /// self + /// } + /// fn name(&self) -> &str { + /// "my_udf" + /// } + /// fn signature(&self) -> &Signature { + /// &self.signature + /// } + /// fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + /// Ok(DataType::Int32) + /// } + /// fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + /// not_impl_err!("not used") + /// } + /// fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + /// if let Some(other) = other.as_any().downcast_ref::<Self>() { + /// self == other + /// } else { + /// false + /// } + /// } + /// fn hash_value(&self) -> u64 { + /// let mut hasher = DefaultHasher::new(); + /// self.param.hash(&mut hasher); + /// self.name().hash(&mut hasher); + /// hasher.finish() + /// } + /// } + /// ``` + /// + /// # Notes + /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, + /// their hash values must also be the same. + /// - Ensure that the implementation does not panic or cause undefined behavior for any input. fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - self.name() == other.name() && self.signature() == other.signature() + // if the pointers are the same, the UDFs are the same + if std::ptr::eq(self.as_any(), other.as_any()) { Review Comment: Could this be `std::ptr::eq(self, other)` here? ########## datafusion/expr/src/udf.rs: ########## @@ -696,16 +696,81 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Return true if this scalar UDF is equal to the other. /// - /// Allows customizing the equality of scalar UDFs. - /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]: + /// This method allows customizing the equality of scalar UDFs. It must adhere to the rules of equivalence: /// - /// - reflexive: `a.equals(a)`; - /// - symmetric: `a.equals(b)` implies `b.equals(a)`; - /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. + /// - Reflexive: `a.equals(a)` must return true. + /// - Symmetric: `a.equals(b)` implies `b.equals(a)`. + /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. /// - /// By default, compares [`Self::name`] and [`Self::signature`]. + /// # Default Behavior + /// By default, this method compares the type IDs, names, and signatures of the two UDFs. If these match, + /// the method assumes the UDFs are not equal unless their pointers are the same. This conservative approach + /// ensures that different instances of the same function type are not mistakenly considered equal. + /// + /// # Custom Implementation + /// If a UDF has internal state or additional properties that should be considered for equality, this method + /// should be overridden. For example, a UDF with parameters might compare those parameters in addition to + /// the default checks. + /// + /// # Example + /// ```rust + /// use std::any::Any; + /// use std::hash::{DefaultHasher, Hash, Hasher}; + /// use arrow::datatypes::DataType; + /// use datafusion_common::{not_impl_err, Result}; + /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; + /// #[derive(Debug)] + /// struct MyUdf { + /// param: i32, + /// signature: Signature, + /// } + /// + /// impl ScalarUDFImpl for MyUdf { + /// fn as_any(&self) -> &dyn Any { + /// self + /// } + /// fn name(&self) -> &str { + /// "my_udf" + /// } + /// fn signature(&self) -> &Signature { + /// &self.signature + /// } + /// fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + /// Ok(DataType::Int32) + /// } + /// fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + /// not_impl_err!("not used") + /// } + /// fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + /// if let Some(other) = other.as_any().downcast_ref::<Self>() { + /// self.param == other.param && self.name() == other.name() + /// } else { + /// false + /// } + /// } + /// fn hash_value(&self) -> u64 { + /// let mut hasher = DefaultHasher::new(); + /// self.param.hash(&mut hasher); + /// self.name().hash(&mut hasher); + /// hasher.finish() + /// } + /// } + /// ``` + /// + /// # Notes + /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, + /// their hash values must also be the same. + /// - Ensure that the implementation does not panic or cause undefined behavior for any input. fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - self.name() == other.name() && self.signature() == other.signature() + // if the pointers are the same, the UDFs are the same + if std::ptr::eq(self.as_any(), other.as_any()) { + return true; + } + + // if the types, names and signatures are the same, we can't know if they are the same so we + // assume they are not. + // If a UDF has internal state that should be compared, it should implement this method + false Review Comment: > // 1. Both objects to have identical TypeId we know typeid via as_any(), right? > // 2. Careful handling of potential padding bytes in structs i don't know if rust does something (zeros) with padding bytes > // 3. The concrete type to be safely comparable via memcmp rust struct is generally moveable around. i think the move semantics are generally about memcpy-ing bits to a new location, so memcmp-ing bits should be fine. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org