viirya commented on code in PR #7878: URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2198202500
########## parquet-variant/src/variant/object.rs: ########## @@ -210,9 +209,80 @@ impl<'m, 'v> VariantObject<'m, 'v> { // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + let field_id_buffer = slice_from_slice( + self.value, + self.header.field_ids_start_byte()..self.first_field_offset_byte, + )?; + + let field_ids = map_bytes_to_offsets(field_id_buffer, self.header.field_id_size) + .collect::<Vec<_>>(); + + // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted + if self.metadata.is_sorted() { + // Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names + // are lexicographically sorted by their field id ordering + if !field_ids.is_sorted() { + return Err(ArrowError::InvalidArgumentError( + "field names not sorted".to_string(), + )); + } + + // Since field ids are sorted, if the last field is smaller than the dictionary size, + // we also know all field ids are smaller than the dictionary size and in-bounds. + if let Some(&last_field_id) = field_ids.last() { + if last_field_id >= self.metadata.dictionary_size() { + return Err(ArrowError::InvalidArgumentError( + "field id is not valid".to_string(), + )); + } + } + } else { + // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names + // to check lexicographical order + let are_field_names_sorted = field_ids + .iter() + .map(|&i| self.metadata.get(i)) + .collect::<Result<Vec<_>, _>>()? 
+ .is_sorted(); + + if !are_field_names_sorted { + return Err(ArrowError::InvalidArgumentError( + "field names not sorted".to_string(), + )); + } + + // Since field ids are not guaranteed to be sorted, scan over all field ids Review Comment: Field name sortedness is different from field id sortedness, right? We can still check whether `field_ids` is sorted or not. If it is sorted, can't we still just check the last field id? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org