viirya commented on code in PR #7878: URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2196635404
##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -228,9 +225,48 @@ impl<'m> VariantMetadata<'m> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_bytes = slice_from_slice(
+                self.bytes,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets
+            // are in-bounds by checking if offsets are monotonically increasing.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // Verify the string values in the dictionary are UTF-8 encoded strings.
+            let value_buffer = slice_from_slice(self.bytes, self.first_value_byte..)?;
+            let value_str = simdutf8::basic::from_utf8(value_buffer)

Review Comment:
What is the difference between the currently used `str::from_utf8` and this `simdutf8::basic::from_utf8`?

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org