friendlymatthew commented on code in PR #7878:
URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2192450387


##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already validated, because we pass it
             // by value to all the children (who would otherwise re-validate it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we read every value through its offsets, proving the values
+            // valid also proves every offset is in bounds
+
+            // note: we slice the offset region out of the value buffer only once
+            let byte_range = self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);
+            if !monotonic_offsets {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (4)
+
+            let value_buffer = &self.value[self.first_value_byte..];

Review Comment:
   Yup, we should be using `slice_from_slice` throughout this PR
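   
   For reference, a minimal sketch (not the PR's code) of what that could look like for the flagged line, assuming `slice_from_slice` keeps the fallible `fn slice_from_slice(bytes: &[u8], range: Range<usize>) -> Result<&[u8], ArrowError>` shape it is used with earlier in this hunk:
   
   ```rust
   // Hypothetical sketch: swap the panicking index expression
   // `&self.value[self.first_value_byte..]` for the fallible helper, so an
   // out-of-range start byte surfaces as an ArrowError instead of a panic.
   let value_buffer = slice_from_slice(self.value, self.first_value_byte..self.value.len())?;
   ```
   
   That would keep full validation panic-free on corrupt input, which seems to be the point of using the helper consistently.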


