This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 710e68e664 Prevent ArrayData validation length overflow (#9816)
710e68e664 is described below

commit 710e68e664ea2f17986d70e621634d76544fe2c6
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Apr 27 15:53:44 2026 -0400

    Prevent ArrayData validation length overflow (#9816)
    
    # Which issue does this PR close?
    
    - None.
    
    # Rationale for this change
    
    `ArrayData` validation used unchecked `usize` arithmetic when combining
    array lengths and offsets. In optimized builds, very large lengths could
    wrap these calculations and allow invalid `ArrayData` metadata to pass
    validation.
    
    # What changes are included in this PR?
    
    This adds checked arithmetic for length plus offset calculations in
    `ArrayData` validation, including offset-buffer validation and related
    typed-buffer sizing paths.
    
    # Are these changes tested?
    
    Yes. This adds regression coverage for overflowing offset-buffer and
    typed-buffer length calculations.
    
    Validated with:
    
    ```bash
    cargo test -p arrow-data overflow --release
    ```
    
    # Are there any user-facing changes?
    
    Invalid `ArrayData` whose length and offset cannot be represented
    without overflow now returns an validation error consistently across
    build modes. There are no API changes.
---
 arrow-data/src/data.rs | 139 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 122 insertions(+), 17 deletions(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index a5a64dfe9f..26f59429ee 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -253,6 +253,18 @@ pub struct ArrayData {
 /// A thread-safe, shared reference to the Arrow array data.
 pub type ArrayDataRef = Arc<ArrayData>;
 
+fn checked_len_plus_offset(
+    data_type: &DataType,
+    len: usize,
+    offset: usize,
+) -> Result<usize, ArrowError> {
+    len.checked_add(offset).ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(
+            "Length {len} with offset {offset} overflows usize for {data_type}"
+        ))
+    })
+}
+
 impl ArrayData {
     /// Create a new ArrayData instance;
     ///
@@ -324,7 +336,8 @@ impl ArrayData {
         // because we use this buffer to calculate `null_count`
         // in `Self::new_unchecked`.
         if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
-            let needed_len = bit_util::ceil(len + offset, 8);
+            let len_plus_offset = checked_len_plus_offset(&data_type, len, 
offset)?;
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
             if null_bit_buffer.len() < needed_len {
                 return Err(ArrowError::InvalidArgumentError(format!(
                     "null_bit_buffer size too small. got {} needed {}",
@@ -814,8 +827,8 @@ impl ArrayData {
     /// See [ArrayData::validate_data] to validate fully the offset content
     /// and the validity of utf8 data
     pub fn validate(&self) -> Result<(), ArrowError> {
-        // Need at least this mich space in each buffer
-        let len_plus_offset = self.len + self.offset;
+        // Need at least this much space in each buffer
+        let len_plus_offset = checked_len_plus_offset(&self.data_type, 
self.len, self.offset)?;
 
         // Check that the data layout conforms to the spec
         let layout = layout(&self.data_type);
@@ -965,7 +978,9 @@ impl ArrayData {
             return Ok(&[]);
         }
 
-        self.typed_buffer(0, self.len + 1)
+        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
+
+        self.typed_buffer(0, len)
     }
 
     /// Returns a reference to the data in `buffers[idx]` as a typed slice 
after validating
@@ -976,7 +991,14 @@ impl ArrayData {
     ) -> Result<&[T], ArrowError> {
         let buffer = &self.buffers[idx];
 
-        let required_len = (len + self.offset) * mem::size_of::<T>();
+        let required_elements = checked_len_plus_offset(&self.data_type, len, 
self.offset)?;
+        let byte_width = mem::size_of::<T>();
+        let required_len = 
required_elements.checked_mul(byte_width).ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Buffer {idx} of {} byte length overflow: {} elements of {} 
bytes exceeds usize",
+                self.data_type, required_elements, byte_width
+            ))
+        })?;
 
         if buffer.len() < required_len {
             return Err(ArrowError::InvalidArgumentError(format!(
@@ -988,7 +1010,7 @@ impl ArrayData {
             )));
         }
 
-        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
+        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
     }
 
     /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
@@ -1172,13 +1194,15 @@ impl ArrayData {
                 for (i, (_, field)) in fields.iter().enumerate() {
                     let field_data = self.get_valid_child_data(i, 
field.data_type())?;
 
-                    if mode == &UnionMode::Sparse && field_data.len < 
(self.len + self.offset) {
-                        return Err(ArrowError::InvalidArgumentError(format!(
-                            "Sparse union child array #{} has length smaller 
than expected for union array ({} < {})",
-                            i,
-                            field_data.len,
-                            self.len + self.offset
-                        )));
+                    if mode == &UnionMode::Sparse {
+                        let len_plus_offset =
+                            checked_len_plus_offset(&self.data_type, self.len, 
self.offset)?;
+                        if field_data.len < len_plus_offset {
+                            return 
Err(ArrowError::InvalidArgumentError(format!(
+                                "Sparse union child array #{} has length 
smaller than expected for union array ({} < {})",
+                                i, field_data.len, len_plus_offset
+                            )));
+                        }
                     }
                 }
                 Ok(())
@@ -1554,7 +1578,7 @@ impl ArrayData {
     where
         T: ArrowNativeType + TryInto<i64> + num_traits::Num + 
std::fmt::Display,
     {
-        let required_len = self.len + self.offset;
+        let required_len = checked_len_plus_offset(&self.data_type, self.len, 
self.offset)?;
         let buffer = &self.buffers[0];
 
         // This should have been checked as part of `validate()` prior
@@ -1562,7 +1586,7 @@ impl ArrayData {
         assert!(buffer.len() / mem::size_of::<T>() >= required_len);
 
         // Justification: buffer size was validated above
-        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset 
+ self.len];
+        let indexes: &[T] = 
&buffer.typed_data::<T>()[self.offset..required_len];
 
         indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
             // Do not check the value is null (value can be arbitrary)
@@ -1612,10 +1636,11 @@ impl ArrayData {
             Ok(())
         })?;
 
-        if prev_value.as_usize() < (self.offset + self.len) {
+        let len_plus_offset = checked_len_plus_offset(&self.data_type, 
self.len, self.offset)?;
+        if prev_value.as_usize() < len_plus_offset {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "The offset + length of array should be less or equal to last 
value in the run_ends array. The last value of run_ends array is {prev_value} 
and offset + length of array is {}.",
-                self.offset + self.len
+                len_plus_offset
             )));
         }
         Ok(())
@@ -2372,6 +2397,86 @@ mod tests {
         assert_eq!(data.null_count() - 1, new_data.null_count());
     }
 
+    #[test]
+    fn test_typed_offsets_length_overflow() {
+        let data = ArrayData {
+            data_type: DataType::Binary,
+            len: usize::MAX,
+            offset: 0,
+            buffers: vec![Buffer::from_slice_ref([0_i32])],
+            child_data: vec![],
+            nulls: None,
+        };
+        let err = data.typed_offsets::<i32>().unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 1 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    #[test]
+    fn test_validate_typed_buffer_length_overflow() {
+        let data = ArrayData {
+            data_type: DataType::Binary,
+            len: 0,
+            offset: 2,
+            buffers: vec![Buffer::from_slice_ref([0_i32])],
+            child_data: vec![],
+            nulls: None,
+        };
+        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 2 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    // Exercises ArrayData::try_new with len + offset overflowing
+    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, 
ArrowError> {
+        ArrayData::try_new(
+            DataType::Binary,
+            usize::MAX,
+            None,
+            1,
+            vec![
+                Buffer::from_slice_ref([0_i32]),
+                Buffer::from_iter(std::iter::empty::<u8>()),
+            ],
+            vec![],
+        )
+    }
+
+    #[cfg(not(feature = "force_validate"))]
+    #[test]
+    fn test_try_new_length_offset_overflow() {
+        let err = try_new_binary_length_offset_overflow().unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 1 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    #[cfg(feature = "force_validate")]
+    #[test]
+    #[should_panic(
+        expected = "Length 18446744073709551615 with offset 1 overflows usize 
for Binary"
+    )]
+    fn test_try_new_length_offset_overflow_force_validate() {
+        try_new_binary_length_offset_overflow().unwrap();
+    }
+
     #[test]
     fn test_equality() {
         let int_data = ArrayData::builder(DataType::Int32)

Reply via email to