This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 794660482b Prevent overflow and panics when casting DATE to TIMESTAMP 
by validating bounds (#18761)
794660482b is described below

commit 794660482bb21a9f974c6345d4b9a631009a316b
Author: kosiew <[email protected]>
AuthorDate: Wed Nov 19 21:47:01 2025 +0800

    Prevent overflow and panics when casting DATE to TIMESTAMP by validating 
bounds (#18761)
    
    ## Which issue does this PR close?
    
    * Closes #17534.
    
    ## Rationale for this change
    
    This change ensures that casting from `Date32` or `Date64` to timestamp
    types behaves safely and predictably. Previously, extreme date values
    (e.g., `9999-12-31`) could cause integer overflow during unit
    conversion, leading to panics in debug mode and silent incorrect results
    in release mode. This patch introduces explicit bounds checking so these
    cases return clear, user-facing errors instead of panicking.
    
    ### Before
    ```
    ❯ cargo run --bin datafusion-cli -- --command "SELECT CAST(DATE 
'9999-12-31' As timestamp);"
        Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.48s
         Running `target/debug/datafusion-cli --command 'SELECT CAST(DATE 
'\''9999-12-31'\'' As timestamp);'`
    DataFusion CLI v51.0.0
    
    thread 'main' panicked at 
/Users/kosiew/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/arrow-cast-57.0.0/src/cast/mod.rs:2079:58:
    attempt to multiply with overflow
    
    ```
    ### After
    ```
    ❯ cargo run --bin datafusion-cli -- --command "SELECT CAST(DATE 
'9999-12-31' As timestamp);"
        Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.64s
         Running `target/debug/datafusion-cli --command 'SELECT CAST(DATE 
'\''9999-12-31'\'' As timestamp);'`
    DataFusion CLI v51.0.0
    Error: Optimizer rule 'simplify_expressions' failed
    caused by
    Execution error: Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range
    ```
    
    ## What changes are included in this PR?
    
    * Introduces `date_to_timestamp_multiplier` and
    `ensure_timestamp_in_bounds` to centralize safe conversion logic.
    * Adds bounds validation for both scalar and array-based casts,
    preventing overflow when multiplying date values.
    * Enhances error messages with consistent timestamp type formatting.
    * Integrates bounds checks into `ScalarValue::cast_to` and
    `ColumnarValue::cast_to`.
    * Adds comprehensive tests covering formatting, overflow detection,
    scalar casts, array casts, and SQL-level behavior.
    
    ## Are these changes tested?
    
    Yes. The PR includes new unit tests validating:
    
    * Formatting of timestamp types in error messages.
    * Overflow detection for both `Date32` and `Date64` to nanoseconds.
    * Array-based overflow handling.
    * SQL-level behavior for casting large date literals.
    
    ## Are there any user-facing changes?
    
    Yes. Users now receive clear and consistent error messages when
    attempting to cast dates that exceed the representable timestamp range
    (for nanosecond timestamps, roughly the years 1677 to 2262). Instead of
    panics or silent overflow, a descriptive execution error is returned.
    
    ## LLM-generated code disclosure
    
    This pull request includes LLM-generated content that has been manually
    reviewed and tested.
---
 datafusion/common/src/scalar/mod.rs          | 174 ++++++++++++++++++++++++++-
 datafusion/core/tests/sql/select.rs          |  17 +++
 datafusion/expr-common/src/columnar_value.rs | 103 ++++++++++++++--
 3 files changed, 283 insertions(+), 11 deletions(-)

diff --git a/datafusion/common/src/scalar/mod.rs 
b/datafusion/common/src/scalar/mod.rs
index 787bd78b1d..3fd5a37224 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -92,6 +92,103 @@ use chrono::{Duration, NaiveDate};
 use half::f16;
 pub use struct_builder::ScalarStructBuilder;
 
+const SECONDS_PER_DAY: i64 = 86_400; // Date32 (days) -> Timestamp(Second) factor
+const MILLIS_PER_DAY: i64 = SECONDS_PER_DAY * 1_000; // Date32 (days) -> Timestamp(Millisecond) factor
+const MICROS_PER_DAY: i64 = MILLIS_PER_DAY * 1_000; // Date32 (days) -> Timestamp(Microsecond) factor
+const NANOS_PER_DAY: i64 = MICROS_PER_DAY * 1_000; // Date32 (days) -> Timestamp(Nanosecond) factor
+const MICROS_PER_MILLISECOND: i64 = 1_000; // Date64 (ms) -> Timestamp(Microsecond) factor
+const NANOS_PER_MILLISECOND: i64 = 1_000_000; // Date64 (ms) -> Timestamp(Nanosecond) factor
+
+/// Returns the multiplier that converts the input date representation into the
+/// desired timestamp unit, if the conversion requires a multiplication that can
+/// overflow an `i64`.
+///
+/// Returns `None` when `target_type` is not a `Timestamp`, when `source_type`
+/// is neither `Date32` nor `Date64`, or when the conversion needs no
+/// multiplication (`Date64` to seconds is a division, `Date64` to
+/// milliseconds is the identity).
+pub fn date_to_timestamp_multiplier(
+    source_type: &DataType,
+    target_type: &DataType,
+) -> Option<i64> {
+    // Only `Timestamp` targets carry a time unit; for anything else no
+    // multiplier applies.
+    let DataType::Timestamp(target_unit, _) = target_type else {
+        return None;
+    };
+
+    match source_type {
+        // Date32 stores days since epoch. Converting to any timestamp
+        // unit requires multiplying by the per-day factor (seconds,
+        // milliseconds, microseconds, nanoseconds).
+        DataType::Date32 => Some(match target_unit {
+            TimeUnit::Second => SECONDS_PER_DAY,
+            TimeUnit::Millisecond => MILLIS_PER_DAY,
+            TimeUnit::Microsecond => MICROS_PER_DAY,
+            TimeUnit::Nanosecond => NANOS_PER_DAY,
+        }),
+
+        // Date64 stores milliseconds since epoch. Only the conversions to
+        // micro/nanoseconds multiply (by 1_000 / 1_000_000).
+        DataType::Date64 => match target_unit {
+            // Converting to seconds is a division, so no multiplication
+            // can overflow.
+            TimeUnit::Second => None,
+            // Converting Date64 (ms since epoch) to millisecond timestamps
+            // is an identity conversion and does not require multiplication.
+            // Returning `None` indicates no multiplication-based overflow
+            // check is necessary.
+            TimeUnit::Millisecond => None,
+            TimeUnit::Microsecond => Some(MICROS_PER_MILLISECOND),
+            TimeUnit::Nanosecond => Some(NANOS_PER_MILLISECOND),
+        },
+
+        _ => None,
+    }
+}
+
+/// Ensures the provided value can be represented as a timestamp with the given
+/// multiplier.
+///
+/// `value` is the raw date value and `multiplier` the scaling factor (see
+/// [`date_to_timestamp_multiplier`]). `source_type` and `target_type` are only
+/// used to build the error message.
+///
+/// # Errors
+///
+/// Returns a [`DataFusionError::Execution`] when `value * multiplier` does not
+/// fit in an `i64`.
+pub fn ensure_timestamp_in_bounds(
+    value: i64,
+    multiplier: i64,
+    source_type: &DataType,
+    target_type: &DataType,
+) -> Result<()> {
+    // Nothing to validate unless the value is actually scaled up.
+    if multiplier <= 1 {
+        return Ok(());
+    }
+
+    if value.checked_mul(multiplier).is_none() {
+        let target = format_timestamp_type_for_error(target_type);
+        _exec_err!(
+            "Cannot cast {} value {} to {}: converted value exceeds the representable i64 range",
+            source_type,
+            value,
+            target
+        )
+    } else {
+        Ok(())
+    }
+}
+
+/// Format a `DataType::Timestamp` into a short, stable string used in
+/// user-facing error messages.
+///
+/// Timestamps render as `Timestamp(s)`, `Timestamp(ms)`, `Timestamp(us)` or
+/// `Timestamp(ns)`; the timezone is omitted. Any other type falls back to its
+/// `Display` output.
+pub(crate) fn format_timestamp_type_for_error(target_type: &DataType) -> String {
+    match target_type {
+        DataType::Timestamp(unit, _) => {
+            let s = match unit {
+                TimeUnit::Second => "s",
+                TimeUnit::Millisecond => "ms",
+                TimeUnit::Microsecond => "us",
+                TimeUnit::Nanosecond => "ns",
+            };
+            format!("Timestamp({s})")
+        }
+        other => other.to_string(),
+    }
+}
+
 /// A dynamically typed, nullable single value.
 ///
 /// While an arrow  [`Array`]) stores one or more values of the same type, in a
@@ -3619,11 +3716,27 @@ impl ScalarValue {
         target_type: &DataType,
         cast_options: &CastOptions<'static>,
     ) -> Result<Self> {
+        let source_type = self.data_type();
+        if let Some(multiplier) = date_to_timestamp_multiplier(&source_type, 
target_type)
+        {
+            if let Some(value) = self.date_scalar_value_as_i64() {
+                ensure_timestamp_in_bounds(value, multiplier, &source_type, 
target_type)?;
+            }
+        }
+
         let scalar_array = self.to_array()?;
         let cast_arr = cast_with_options(&scalar_array, target_type, 
cast_options)?;
         ScalarValue::try_from_array(&cast_arr, 0)
     }
 
+    /// Returns the raw integer held by a non-null `Date32` or `Date64`
+    /// scalar, widened to `i64`. Null dates and every other variant yield
+    /// `None`.
+    fn date_scalar_value_as_i64(&self) -> Option<i64> {
+        match self {
+            ScalarValue::Date32(days) => days.map(i64::from),
+            ScalarValue::Date64(millis) => *millis,
+            _ => None,
+        }
+    }
+
     fn eq_array_decimal32(
         array: &ArrayRef,
         index: usize,
@@ -4991,7 +5104,7 @@ mod tests {
     use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer};
     use arrow::compute::{is_null, kernels};
     use arrow::datatypes::{
-        ArrowNumericType, Fields, Float64Type, DECIMAL256_MAX_PRECISION,
+        ArrowNumericType, Fields, Float64Type, TimeUnit, 
DECIMAL256_MAX_PRECISION,
     };
     use arrow::error::ArrowError;
     use arrow::util::pretty::pretty_format_columns;
@@ -5024,6 +5137,52 @@ mod tests {
         assert_eq!(actual, &expected);
     }
 
+    #[test]
+    fn test_format_timestamp_type_for_error_and_bounds() {
+        // The format helper renders the unit as a short suffix.
+        let ts_ns = format_timestamp_type_for_error(&DataType::Timestamp(
+            TimeUnit::Nanosecond,
+            None,
+        ));
+        assert_eq!(ts_ns, "Timestamp(ns)");
+
+        let ts_us = format_timestamp_type_for_error(&DataType::Timestamp(
+            TimeUnit::Microsecond,
+            None,
+        ));
+        assert_eq!(ts_us, "Timestamp(us)");
+
+        // ensure_timestamp_in_bounds: Date32 non-overflow
+        let ok = ensure_timestamp_in_bounds(
+            1000,
+            NANOS_PER_DAY,
+            &DataType::Date32,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(ok.is_ok());
+
+        // Date32 overflow -- known large day value (9999-12-31 -> 2932896)
+        let err = ensure_timestamp_in_bounds(
+            2932896,
+            NANOS_PER_DAY,
+            &DataType::Date32,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(err.is_err());
+        let msg = err.unwrap_err().to_string();
+        assert!(msg.contains("Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range"));
+
+        // Date64 overflow for ns (millis * 1_000_000): one past the largest
+        // millisecond value that still fits after scaling.
+        let overflow_millis: i64 = (i64::MAX / NANOS_PER_MILLISECOND) + 1;
+        let err2 = ensure_timestamp_in_bounds(
+            overflow_millis,
+            NANOS_PER_MILLISECOND,
+            &DataType::Date64,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(err2.is_err());
+    }
+
     #[test]
     fn test_scalar_value_from_for_struct() {
         let boolean = Arc::new(BooleanArray::from(vec![false]));
@@ -8605,6 +8764,19 @@ mod tests {
         assert!(dense_scalar.is_null());
     }
 
+    #[test]
+    fn cast_date_to_timestamp_overflow_returns_error() {
+        // `i32::MAX` days scaled to nanoseconds cannot fit in an `i64`, so the
+        // scalar cast must fail with the bounds error instead of overflowing.
+        let scalar = ScalarValue::Date32(Some(i32::MAX));
+        let err = scalar
+            .cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None))
+            .expect_err("expected cast to fail");
+        assert!(
+            err.to_string()
+                .contains("converted value exceeds the representable i64 range"),
+            "unexpected error: {err}"
+        );
+    }
+
     #[test]
     fn null_dictionary_scalar_produces_null_dictionary_array() {
         let dictionary_scalar = ScalarValue::Dictionary(
diff --git a/datafusion/core/tests/sql/select.rs 
b/datafusion/core/tests/sql/select.rs
index 28f0dcb8bd..84899137e5 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -414,3 +414,20 @@ async fn test_select_no_projection() -> Result<()> {
     ");
     Ok(())
 }
+
+// SQL-level check: casting a date literal far outside the nanosecond
+// timestamp range must surface the bounds error rather than panic.
+#[tokio::test]
+async fn test_select_cast_date_literal_to_timestamp_overflow() -> Result<()> {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .sql("SELECT CAST(DATE '9999-12-31' AS TIMESTAMP)")
+        .await?
+        .collect()
+        .await
+        .unwrap_err();
+
+    assert_contains!(
+        err.to_string(),
+        "Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range"
+    );
+    Ok(())
+}
diff --git a/datafusion/expr-common/src/columnar_value.rs 
b/datafusion/expr-common/src/columnar_value.rs
index d508164b6b..585b47a980 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -17,12 +17,19 @@
 
 //! [`ColumnarValue`] represents the result of evaluating an expression.
 
-use arrow::array::{Array, ArrayRef, NullArray};
-use arrow::compute::{kernels, CastOptions};
-use arrow::datatypes::DataType;
-use arrow::util::pretty::pretty_format_columns;
-use datafusion_common::format::DEFAULT_CAST_OPTIONS;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use arrow::{
+    array::{Array, ArrayRef, Date32Array, Date64Array, NullArray},
+    compute::{kernels, max, min, CastOptions},
+    datatypes::DataType,
+    util::pretty::pretty_format_columns,
+};
+use datafusion_common::internal_datafusion_err;
+use datafusion_common::{
+    format::DEFAULT_CAST_OPTIONS,
+    internal_err,
+    scalar::{date_to_timestamp_multiplier, ensure_timestamp_in_bounds},
+    Result, ScalarValue,
+};
 use std::fmt;
 use std::sync::Arc;
 
@@ -275,9 +282,14 @@ impl ColumnarValue {
     ) -> Result<ColumnarValue> {
         let cast_options = 
cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
         match self {
-            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
-                kernels::cast::cast_with_options(array, cast_type, 
&cast_options)?,
-            )),
+            ColumnarValue::Array(array) => {
+                ensure_date_array_timestamp_bounds(array, cast_type)?;
+                Ok(ColumnarValue::Array(kernels::cast::cast_with_options(
+                    array,
+                    cast_type,
+                    &cast_options,
+                )?))
+            }
             ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
                 scalar.cast_to_with_options(cast_type, &cast_options)?,
             )),
@@ -285,6 +297,59 @@ impl ColumnarValue {
     }
 }
 
+/// Checks that every value of a `Date32`/`Date64` array can be cast to the
+/// timestamp `cast_type` without overflowing `i64`.
+///
+/// Source/target combinations for which [`date_to_timestamp_multiplier`]
+/// yields no multiplier pass trivially.
+///
+/// # Errors
+///
+/// Returns the error from [`ensure_timestamp_in_bounds`] when the smallest or
+/// largest value overflows, or an internal error if the array's concrete type
+/// does not match its declared data type.
+fn ensure_date_array_timestamp_bounds(
+    array: &ArrayRef,
+    cast_type: &DataType,
+) -> Result<()> {
+    let source_type = array.data_type();
+    let Some(multiplier) = date_to_timestamp_multiplier(source_type, cast_type) else {
+        return Ok(());
+    };
+
+    // Skip the min/max scan when no scaling can take place.
+    if multiplier <= 1 {
+        return Ok(());
+    }
+
+    // Use compute kernels to find min/max instead of iterating all elements
+    let (min_val, max_val): (Option<i64>, Option<i64>) = match source_type {
+        DataType::Date32 => {
+            let arr = array
+                .as_any()
+                .downcast_ref::<Date32Array>()
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Expected Date32Array but found {}",
+                        array.data_type()
+                    )
+                })?;
+            (min(arr).map(i64::from), max(arr).map(i64::from))
+        }
+        DataType::Date64 => {
+            let arr = array
+                .as_any()
+                .downcast_ref::<Date64Array>()
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Expected Date64Array but found {}",
+                        array.data_type()
+                    )
+                })?;
+            (min(arr), max(arr))
+        }
+        _ => return Ok(()), // Not a date type, nothing to do
+    };
+
+    // Scaling by a positive factor preserves ordering, so if both extremes
+    // fit, every value in between fits too. When a kernel reports no extreme
+    // there is nothing to check.
+    if let Some(lowest) = min_val {
+        ensure_timestamp_in_bounds(lowest, multiplier, source_type, cast_type)?;
+    }
+    if let Some(highest) = max_val {
+        ensure_timestamp_in_bounds(highest, multiplier, source_type, cast_type)?;
+    }
+
+    Ok(())
+}
+
 // Implement Display trait for ColumnarValue
 impl fmt::Display for ColumnarValue {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -312,7 +377,10 @@ impl fmt::Display for ColumnarValue {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::Int32Array;
+    use arrow::{
+        array::{Date64Array, Int32Array},
+        datatypes::TimeUnit,
+    };
 
     #[test]
     fn into_array_of_size() {
@@ -484,4 +552,19 @@ mod tests {
             )
         );
     }
+
+    #[test]
+    fn cast_date64_array_to_timestamp_overflow() {
+        // One past the largest millisecond value that still fits in an `i64`
+        // after scaling to nanoseconds.
+        let overflow_value = i64::MAX / 1_000_000 + 1;
+        let array: ArrayRef = Arc::new(Date64Array::from(vec![Some(overflow_value)]));
+        let value = ColumnarValue::Array(array);
+        let result =
+            value.cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None), None);
+        let err = result.expect_err("expected overflow to be detected");
+        assert!(
+            err.to_string()
+                .contains("converted value exceeds the representable i64 range"),
+            "unexpected error: {err}"
+        );
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to