Jefffrey commented on code in PR #20595: URL: https://github.com/apache/datafusion/pull/20595#discussion_r3185738954
########## datafusion/spark/src/function/math/isnan.rs: ########## @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::datatypes::{ArrowPrimitiveType, DataType, Float32Type, Float64Type}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use num_traits::Float; + +/// Spark-compatible `isnan` expression +/// <https://spark.apache.org/docs/latest/api/sql/index.html#isnan> +/// +/// Differences with DataFusion isnan: +/// - Spark returns `false` for NULL inputs; DataFusion propagates NULL +/// - Spark only accepts Float32 and Float64; DataFusion accepts all numeric +/// types (returning false for integers and decimals, which are never NaN) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkIsNaN { + signature: Signature, +} + +impl Default for SparkIsNaN { + fn default() -> Self { + Self::new() + } +} + +impl SparkIsNaN { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Float32]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkIsNaN { + fn name(&self) -> &str { + "isnan" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { Review Comment: I think we should check which nullability should be here; by default it is `true` ########## datafusion/spark/src/function/math/isnan.rs: ########## @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::datatypes::{ArrowPrimitiveType, DataType, Float32Type, Float64Type}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use num_traits::Float; + +/// Spark-compatible `isnan` expression +/// <https://spark.apache.org/docs/latest/api/sql/index.html#isnan> +/// +/// Differences with DataFusion isnan: +/// - Spark returns `false` for NULL inputs; DataFusion propagates NULL +/// - Spark only accepts Float32 and Float64; DataFusion accepts all numeric +/// types (returning false for integers and decimals, which are never NaN) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkIsNaN { + signature: Signature, +} + +impl Default for SparkIsNaN { + fn default() -> Self { + Self::new() + } +} + +impl SparkIsNaN { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Float32]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkIsNaN { + fn name(&self) -> &str { + "isnan" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + spark_isnan(&args.args) + } +} + +fn spark_isnan(args: &[ColumnarValue]) -> Result<ColumnarValue> { + let [array] = take_function_args("isnan", ColumnarValue::values_to_arrays(args)?)?; + + let result = match array.data_type() { + DataType::Float32 => isnan_array::<Float32Type>(&array), + DataType::Float64 => isnan_array::<Float64Type>(&array), + other => return exec_err!("Unsupported data type {other:?} for function isnan"), + }; + Ok(ColumnarValue::Array(result)) +} Review Comment: ```suggestion fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { let [array] = take_function_args("isnan", &args.args)?; let array = array.to_array_of_size(args.number_rows)?; let result = match array.data_type() { DataType::Float32 => isnan_array::<Float32Type>(&array), DataType::Float64 => isnan_array::<Float64Type>(&array), other => { return exec_err!("Unsupported data type {other:?} for function isnan"); } }; Ok(ColumnarValue::Array(result)) } } ``` Might as well inline this, and also no need to go through `ColumnarValue::values_to_arrays` since we expect a single argument ########## datafusion/spark/src/function/math/isnan.rs: ########## @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::datatypes::{ArrowPrimitiveType, DataType, Float32Type, Float64Type}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use num_traits::Float; + +/// Spark-compatible `isnan` expression +/// <https://spark.apache.org/docs/latest/api/sql/index.html#isnan> +/// +/// Differences with DataFusion isnan: +/// - Spark returns `false` for NULL inputs; DataFusion propagates NULL +/// - Spark only accepts Float32 and Float64; DataFusion accepts all numeric +/// types (returning false for integers and decimals, which are never NaN) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkIsNaN { + signature: Signature, +} + +impl Default for SparkIsNaN { + fn default() -> Self { + Self::new() + } +} + +impl SparkIsNaN { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Float32]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkIsNaN { + fn name(&self) -> &str { + "isnan" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + spark_isnan(&args.args) + } +} + +fn spark_isnan(args: &[ColumnarValue]) -> Result<ColumnarValue> { + let [array] = take_function_args("isnan", ColumnarValue::values_to_arrays(args)?)?; + + let result = match array.data_type() { + DataType::Float32 => isnan_array::<Float32Type>(&array), + DataType::Float64 => isnan_array::<Float64Type>(&array), + other => return exec_err!("Unsupported data type {other:?} for function isnan"), + }; + Ok(ColumnarValue::Array(result)) +} + +fn isnan_array<T>(array: &ArrayRef) -> ArrayRef +where + T: ArrowPrimitiveType, + T::Native: Float, +{ + let array = array.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap(); + nulls_to_false(BooleanArray::from_unary(array, |x| x.is_nan())) +} + +/// Replaces null values with false in a BooleanArray. +/// +/// Spark's `isnan` returns `false` for NULL inputs rather than propagating NULL. +fn nulls_to_false(is_nan: BooleanArray) -> ArrayRef { + match is_nan.nulls() { + Some(nulls) => { + let is_not_null = nulls.inner(); + Arc::new(BooleanArray::new(is_nan.values() & is_not_null, None)) + } + None => Arc::new(is_nan), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Float32Array, Float64Array}; + use datafusion_common::ScalarValue; + + #[test] + fn test_isnan_float64() { Review Comment: Could we move all these tests to SLTs? ########## datafusion/spark/src/function/math/isnan.rs: ########## @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::datatypes::{ArrowPrimitiveType, DataType, Float32Type, Float64Type}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use num_traits::Float; + +/// Spark-compatible `isnan` expression +/// <https://spark.apache.org/docs/latest/api/sql/index.html#isnan> +/// +/// Differences with DataFusion isnan: +/// - Spark returns `false` for NULL inputs; DataFusion propagates NULL +/// - Spark only accepts Float32 and Float64; DataFusion accepts all numeric +/// types (returning false for integers and decimals, which are never NaN) Review Comment: This seems odd; testing on PySpark: ```python >>> spark.version '4.1.1' >>> spark.sql("select isnan(1::int)").show() +---------------------+ |isnan(CAST(1 AS INT))| +---------------------+ | false| +---------------------+ ``` Where is this expected behaviour for Spark coming from? ########## datafusion/spark/src/function/math/isnan.rs: ########## @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::datatypes::{ArrowPrimitiveType, DataType, Float32Type, Float64Type}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use num_traits::Float; + +/// Spark-compatible `isnan` expression +/// <https://spark.apache.org/docs/latest/api/sql/index.html#isnan> +/// +/// Differences with DataFusion isnan: +/// - Spark returns `false` for NULL inputs; DataFusion propagates NULL +/// - Spark only accepts Float32 and Float64; DataFusion accepts all numeric +/// types (returning false for integers and decimals, which are never NaN) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkIsNaN { + signature: Signature, +} + +impl Default for SparkIsNaN { + fn default() -> Self { + Self::new() + } +} + +impl SparkIsNaN { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Float32]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkIsNaN { + fn name(&self) -> &str { + "isnan" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + spark_isnan(&args.args) + } +} + +fn spark_isnan(args: &[ColumnarValue]) -> Result<ColumnarValue> { + let [array] = take_function_args("isnan", ColumnarValue::values_to_arrays(args)?)?; + + let result = match array.data_type() { + DataType::Float32 => isnan_array::<Float32Type>(&array), + DataType::Float64 => isnan_array::<Float64Type>(&array), + other => return exec_err!("Unsupported data type {other:?} for function isnan"), + }; + Ok(ColumnarValue::Array(result)) +} + +fn isnan_array<T>(array: &ArrayRef) -> ArrayRef +where + T: ArrowPrimitiveType, + T::Native: Float, +{ + let array = array.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap(); Review Comment: ```suggestion let array = array.as_primitive::<T>(); ``` ########## datafusion/sqllogictest/test_files/spark/math/isnan.slt: ########## @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file is part of the implementation of the datafusion-spark function library. +# For more information, please see: +# https://github.com/apache/datafusion/issues/15914 + +# Tests for Spark-compatible isnan function. +# Spark semantics differ from DataFusion's built-in isnan: +# Spark returns false for NULL inputs; DataFusion returns NULL. +# +# Example: SELECT isnan(NULL::DOUBLE) +# Spark: returns false +# DataFusion: returns NULL + +# Scalar input: float64 +query BBBBB +SELECT isnan(1.0::DOUBLE), isnan('NaN'::DOUBLE), isnan('inf'::DOUBLE), isnan(0.0::DOUBLE), isnan(-1.0::DOUBLE); +---- +false true false false false + +# Scalar input: float64 NULL returns false (Spark behaviour, not NULL) +query B +SELECT isnan(NULL::DOUBLE); +---- +false + +# Scalar input: float32 +query BBBBB +SELECT isnan(1.0::FLOAT), isnan('NaN'::FLOAT), isnan('inf'::FLOAT), isnan(0.0::FLOAT), isnan(-1.0::FLOAT); +---- +false true false false false + +# Scalar input: float32 NULL returns false +query B +SELECT isnan(NULL::FLOAT); +---- +false + +# Array input: float64 — NULL produces false, not NULL +query B +SELECT isnan(a) FROM (VALUES (1.0::DOUBLE), ('NaN'::DOUBLE), (NULL::DOUBLE), ('inf'::DOUBLE), (0.0::DOUBLE)) AS t(a); +---- +false +true +false +false +false + +# Array input: float32 +query B +SELECT isnan(a) FROM (VALUES (1.0::FLOAT), ('NaN'::FLOAT), (NULL::FLOAT), ('inf'::FLOAT), (0.0::FLOAT)) AS t(a); +---- +false +true +false +false +false + +# Spark's isnan accepts any numeric type by implicitly casting to double, so +# integers, bigints, and decimals all return false (they can never be NaN). +# This test guards against the signature being accidentally narrowed in a way +# that would diverge from Spark. +query BBBB Review Comment: This conflicts with the docstring stating Spark only accepts f32/f64 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
