This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 097f04c2ec fix(spark): handle divide-by-zero in Spark `mod`/`pmod`
with ANSI mode support (#20461)
097f04c2ec is described below
commit 097f04c2ec728791683b00f017831368829dac86
Author: David López <[email protected]>
AuthorDate: Mon Mar 9 21:49:58 2026 +0100
fix(spark): handle divide-by-zero in Spark `mod`/`pmod` with ANSI mode
support (#20461)
## Which issue does this PR close?
- N/A.
## Rationale for this change
Spark's `mod` and `pmod` functions return `NULL` on integer division by
zero in legacy mode (ANSI off), but DataFusion's implementation always
threw a `DivideByZero` error regardless of the ANSI mode setting.
## What changes are included in this PR?
Add ANSI mode support to `spark_mod` and `spark_pmod` via the
`enable_ansi_mode` config option.
- In legacy mode (ANSI off): division by zero returns `NULL` per-element.
- In ANSI mode (ANSI on): division by zero throws an error (unchanged
behavior).
- Add a `try_rem` helper that handles per-element zero-divisor masking for
integer arrays.
## Are these changes tested?
Yes:
- 18 unit tests in modulus.rs (including new tests for both ANSI modes)
- Updated pmod.slt and mod.slt sqllogictests with ANSI on/off coverage
## Are there any user-facing changes?
Yes — mod(10, 0) and pmod(10, 0) now return NULL instead of erroring
when enable_ansi_mode = false (the default), matching Spark behavior.
---
datafusion/spark/src/function/math/modulus.rs | 144 ++++++++++++++++-----
.../sqllogictest/test_files/spark/math/mod.slt | 29 +++++
.../sqllogictest/test_files/spark/math/pmod.slt | 22 +++-
3 files changed, 164 insertions(+), 31 deletions(-)
diff --git a/datafusion/spark/src/function/math/modulus.rs
b/datafusion/spark/src/function/math/modulus.rs
index 49657e2cb8..7a21aabbdf 100644
--- a/datafusion/spark/src/function/math/modulus.rs
+++ b/datafusion/spark/src/function/math/modulus.rs
@@ -15,8 +15,13 @@
// specific language governing permissions and limitations
// under the License.
+use arrow::array::{Scalar, new_null_array};
use arrow::compute::kernels::numeric::add;
-use arrow::compute::kernels::{cmp::lt, numeric::rem, zip::zip};
+use arrow::compute::kernels::{
+ cmp::{eq, lt},
+ numeric::rem,
+ zip::zip,
+};
use arrow::datatypes::DataType;
use datafusion_common::{Result, ScalarValue, assert_eq_or_internal_err};
use datafusion_expr::{
@@ -24,28 +29,61 @@ use datafusion_expr::{
};
use std::any::Any;
+/// Attempts `rem(left, right)` with per-element divide-by-zero handling.
+/// In ANSI mode, any zero divisor causes an error.
+/// In legacy mode (ANSI off), positions where the divisor is zero return NULL
+/// while other positions compute normally.
+fn try_rem(
+ left: &arrow::array::ArrayRef,
+ right: &arrow::array::ArrayRef,
+ enable_ansi_mode: bool,
+) -> Result<arrow::array::ArrayRef> {
+ match rem(left, right) {
+ Ok(result) => Ok(result),
+ Err(arrow::error::ArrowError::DivideByZero) if !enable_ansi_mode => {
+ // Integer rem fails when ANY divisor element is zero.
+ // Handle per-element: null out zero divisors
+ let zero = ScalarValue::new_zero(right.data_type())?.to_array()?;
+ let zero = Scalar::new(zero);
+ let null = Scalar::new(new_null_array(right.data_type(), 1));
+ let is_zero = eq(right, &zero)?;
+ let safe_right = zip(&is_zero, &null, right)?;
+ Ok(rem(left, &safe_right)?)
+ }
+ Err(e) => Err(e.into()),
+ }
+}
+
/// Spark-compatible `mod` function
-/// This function directly uses Arrow's arithmetic_op function for modulo
operations
-pub fn spark_mod(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+/// In ANSI mode, division by zero throws an error.
+/// In legacy mode, division by zero returns NULL (Spark behavior).
+pub fn spark_mod(
+ args: &[ColumnarValue],
+ enable_ansi_mode: bool,
+) -> Result<ColumnarValue> {
assert_eq_or_internal_err!(args.len(), 2, "mod expects exactly two
arguments");
let args = ColumnarValue::values_to_arrays(args)?;
- let result = rem(&args[0], &args[1])?;
+ let result = try_rem(&args[0], &args[1], enable_ansi_mode)?;
Ok(ColumnarValue::Array(result))
}
/// Spark-compatible `pmod` function
-/// This function directly uses Arrow's arithmetic_op function for modulo
operations
-pub fn spark_pmod(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+/// In ANSI mode, division by zero throws an error.
+/// In legacy mode, division by zero returns NULL (Spark behavior).
+pub fn spark_pmod(
+ args: &[ColumnarValue],
+ enable_ansi_mode: bool,
+) -> Result<ColumnarValue> {
assert_eq_or_internal_err!(args.len(), 2, "pmod expects exactly two
arguments");
let args = ColumnarValue::values_to_arrays(args)?;
let left = &args[0];
let right = &args[1];
let zero =
ScalarValue::new_zero(left.data_type())?.to_array_of_size(left.len())?;
- let result = rem(left, right)?;
+ let result = try_rem(left, right, enable_ansi_mode)?;
let neg = lt(&result, &zero)?;
let plus = zip(&neg, right, &zero)?;
let result = add(&plus, &result)?;
- let result = rem(&result, right)?;
+ let result = try_rem(&result, right, enable_ansi_mode)?;
Ok(ColumnarValue::Array(result))
}
@@ -95,7 +133,7 @@ impl ScalarUDFImpl for SparkMod {
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- spark_mod(&args.args)
+ spark_mod(&args.args, args.config_options.execution.enable_ansi_mode)
}
}
@@ -145,7 +183,7 @@ impl ScalarUDFImpl for SparkPmod {
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- spark_pmod(&args.args)
+ spark_pmod(&args.args, args.config_options.execution.enable_ansi_mode)
}
}
@@ -165,7 +203,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_mod(&[left_value, right_value]).unwrap();
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
@@ -187,7 +225,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_mod(&[left_value, right_value]).unwrap();
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int64 =
@@ -228,7 +266,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_mod(&[left_value, right_value]).unwrap();
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_float64 = result_array
@@ -284,7 +322,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_mod(&[left_value, right_value]).unwrap();
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_float32 = result_array
@@ -319,7 +357,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
- let result = spark_mod(&[left_value, right_value]).unwrap();
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
@@ -337,20 +375,43 @@ mod test {
let left = Int32Array::from(vec![Some(10)]);
let left_value = ColumnarValue::Array(Arc::new(left));
- let result = spark_mod(&[left_value]);
+ let result = spark_mod(&[left_value], false);
assert!(result.is_err());
}
#[test]
- fn test_mod_zero_division() {
+ fn test_mod_zero_division_legacy() {
+ // In legacy mode (ANSI off), division by zero returns NULL per-element
+ let left = Int32Array::from(vec![Some(10), Some(7), Some(15)]);
+ let right = Int32Array::from(vec![Some(0), Some(2), Some(4)]);
+
+ let left_value = ColumnarValue::Array(Arc::new(left));
+ let right_value = ColumnarValue::Array(Arc::new(right));
+
+ let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+ if let ColumnarValue::Array(result_array) = result {
+ let result_int32 =
+ result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert!(result_int32.is_null(0)); // 10 % 0 = NULL
+ assert_eq!(result_int32.value(1), 1); // 7 % 2 = 1
+ assert_eq!(result_int32.value(2), 3); // 15 % 4 = 3
+ } else {
+ panic!("Expected array result");
+ }
+ }
+
+ #[test]
+ fn test_mod_zero_division_ansi() {
+ // In ANSI mode, division by zero should error
let left = Int32Array::from(vec![Some(10), Some(7), Some(15)]);
let right = Int32Array::from(vec![Some(0), Some(2), Some(4)]);
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_mod(&[left_value, right_value]);
- assert!(result.is_err()); // Division by zero should error
+ let result = spark_mod(&[left_value, right_value], true);
+ assert!(result.is_err());
}
// PMOD tests
@@ -362,7 +423,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
@@ -385,7 +446,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int64 =
@@ -425,7 +486,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_float64 = result_array
@@ -476,7 +537,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_float32 = result_array
@@ -508,7 +569,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
@@ -527,20 +588,43 @@ mod test {
let left = Int32Array::from(vec![Some(10)]);
let left_value = ColumnarValue::Array(Arc::new(left));
- let result = spark_pmod(&[left_value]);
+ let result = spark_pmod(&[left_value], false);
assert!(result.is_err());
}
#[test]
- fn test_pmod_zero_division() {
+ fn test_pmod_zero_division_legacy() {
+ // In legacy mode (ANSI off), division by zero returns NULL per-element
let left = Int32Array::from(vec![Some(10), Some(-7), Some(15)]);
let right = Int32Array::from(vec![Some(0), Some(0), Some(4)]);
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]);
- assert!(result.is_err()); // Division by zero should error
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+ if let ColumnarValue::Array(result_array) = result {
+ let result_int32 =
+ result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert!(result_int32.is_null(0)); // 10 pmod 0 = NULL
+ assert!(result_int32.is_null(1)); // -7 pmod 0 = NULL
+ assert_eq!(result_int32.value(2), 3); // 15 pmod 4 = 3
+ } else {
+ panic!("Expected array result");
+ }
+ }
+
+ #[test]
+ fn test_pmod_zero_division_ansi() {
+ // In ANSI mode, division by zero should error
+ let left = Int32Array::from(vec![Some(10), Some(-7), Some(15)]);
+ let right = Int32Array::from(vec![Some(0), Some(0), Some(4)]);
+
+ let left_value = ColumnarValue::Array(Arc::new(left));
+ let right_value = ColumnarValue::Array(Arc::new(right));
+
+ let result = spark_pmod(&[left_value, right_value], true);
+ assert!(result.is_err());
}
#[test]
@@ -552,7 +636,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
@@ -590,7 +674,7 @@ mod test {
let left_value = ColumnarValue::Array(Arc::new(left));
let right_value = ColumnarValue::Array(Arc::new(right));
- let result = spark_pmod(&[left_value, right_value]).unwrap();
+ let result = spark_pmod(&[left_value, right_value], false).unwrap();
if let ColumnarValue::Array(result_array) = result {
let result_int32 =
diff --git a/datafusion/sqllogictest/test_files/spark/math/mod.slt
b/datafusion/sqllogictest/test_files/spark/math/mod.slt
index 2780b3e105..68c0f59f48 100644
--- a/datafusion/sqllogictest/test_files/spark/math/mod.slt
+++ b/datafusion/sqllogictest/test_files/spark/math/mod.slt
@@ -144,6 +144,35 @@ SELECT MOD(10.0::decimal(3,1), 3.0::decimal(2,1)) as
mod_decimal_2;
----
1
+# Division by zero returns NULL in legacy mode (ANSI off)
+query I
+SELECT MOD(10::int, 0::int) as mod_div_zero_1;
+----
+NULL
+
+query I
+SELECT MOD(-7::int, 0::int) as mod_div_zero_2;
+----
+NULL
+
+query R
+SELECT MOD(10.5::float8, 0.0::float8) as mod_div_zero_float;
+----
+NaN
+
+# Division by zero errors in ANSI mode
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT MOD(10::int, 0::int);
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT MOD(-7::int, 0::int);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+
# Edge cases
query I
SELECT MOD(0::int, 5::int) as mod_zero_1;
diff --git a/datafusion/sqllogictest/test_files/spark/math/pmod.slt
b/datafusion/sqllogictest/test_files/spark/math/pmod.slt
index cf273c2d78..aa4a197ba4 100644
--- a/datafusion/sqllogictest/test_files/spark/math/pmod.slt
+++ b/datafusion/sqllogictest/test_files/spark/math/pmod.slt
@@ -64,8 +64,28 @@ SELECT pmod(0::int, 5::int) as pmod_zero_1;
----
0
-statement error DataFusion error: Arrow error: Divide by zero error
+query I
SELECT pmod(10::int, 0::int) as pmod_zero_2;
+----
+NULL
+
+query I
+SELECT pmod(-7::int, 0::int) as pmod_zero_3;
+----
+NULL
+
+# Division by zero errors in ANSI mode
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT pmod(10::int, 0::int);
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT pmod(-7::int, 0::int);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
# PMOD tests with NULL values
query I
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]