viirya commented on code in PR #7878:
URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2196637245


##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -228,9 +225,48 @@ impl<'m> VariantMetadata<'m> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_bytes = slice_from_slice(
+                self.bytes,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_bytes, 
self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow validation ensures the first and last offsets are 
in bounds, we can also verify all offsets
+            // are in-bounds by checking if offsets are monotonically 
increasing.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // Verify the string values in the dictionary are UTF-8 encoded 
strings.
+            let value_buffer = slice_from_slice(self.bytes, 
self.first_value_byte..)?;
+            let value_str = simdutf8::basic::from_utf8(value_buffer)
+                .map_err(|e| 
ArrowError::InvalidArgumentError(format!("{e:?}")))?;

Review Comment:
   If there is an error, will `e` contain the entire invalid byte sequence? If 
so, the error output might be hard to read.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to