alamb commented on code in PR #7906:
URL: https://github.com/apache/arrow-rs/pull/7906#discussion_r2201478740
##########
parquet-variant/src/variant/object.rs:
##########
@@ -242,6 +242,8 @@ impl<'m, 'v> VariantObject<'m, 'v> {
         } else {
             // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names
             // to check lexicographical order
+            //
+            // Since we are probing the metadata dictionary by field id, we can also verify field ids are in-bounds

Review Comment:
```suggestion
            // Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
```

##########
parquet-variant/src/variant/list.rs:
##########
@@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> {
             self.header.first_offset_byte() as _..self.first_value_byte as _,
         )?;

-        let offsets =
-            map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();
-
-        // Validate offsets are in-bounds and monotonically increasing.
-        // Since shallow verification checks whether the first and last offsets are in-bounds,
-        // we can also verify all offsets are in-bounds by checking if offsets are monotonically increasing.
-        let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-        if !are_offsets_monotonic {
-            return Err(ArrowError::InvalidArgumentError(
-                "offsets are not monotonically increasing".to_string(),
-            ));
-        }
-
         let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;

         // Validate whether values are valid variant objects
-        for i in 1..offsets.len() {
-            let start_offset = offsets[i - 1];
-            let end_offset = offsets[i];
-
-            let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
+        //
+        // Since we use offsets to slice into the value buffer, we can also verify all offsets are in-bounds

Review Comment:
```suggestion
        // Since we use offsets to slice into the value buffer, this also verifies all offsets are in-bounds
```

##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -237,22 +237,15 @@ impl<'m> VariantMetadata<'m> {
         let offsets =
             map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();

-        // Validate offsets are in-bounds and monotonically increasing.
-        // Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets
-        // are in-bounds by checking if offsets are monotonically increasing.
-        let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-        if !are_offsets_monotonic {
-            return Err(ArrowError::InvalidArgumentError(
-                "offsets not monotonically increasing".to_string(),
-            ));
-        }
-
         // Verify the string values in the dictionary are UTF-8 encoded strings.
         let value_buffer =
             string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;

         if self.header.is_sorted {
             // Validate the dictionary values are unique and lexicographically sorted
+            //
+            // Since we use the offsets to access dictionary values, we can also validate

Review Comment:
```suggestion
            // Since we use the offsets to access dictionary values, this also validates
```
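All three suggestions describe the same pattern: validation that falls out of normal use rather than a separate pass. Below is a minimal sketch of the object.rs case, in which `field_name` is a hypothetical stand-in for the crate's dictionary probe (the names and signatures here are assumptions, not the crate's actual API); because each lookup fails for an out-of-range field id, the lexicographical-order check bounds-checks every id as a side effect.

```rust
/// Hypothetical stand-in for the metadata dictionary probe: an out-of-range
/// field id is an error, so every lookup doubles as a bounds check.
fn field_name<'a>(dictionary: &[&'a str], field_id: usize) -> Result<&'a str, String> {
    dictionary
        .get(field_id)
        .copied()
        .ok_or_else(|| format!("field id {field_id} out of range ({} entries)", dictionary.len()))
}

/// Checking that field names are in lexicographical order verifies the field
/// ids are in-bounds as a by-product of probing the dictionary.
fn validate_field_order(dictionary: &[&str], field_ids: &[usize]) -> Result<(), String> {
    let mut prev: Option<&str> = None;
    for &id in field_ids {
        let name = field_name(dictionary, id)?; // the in-bounds check happens here
        if prev.is_some_and(|p| p >= name) {
            return Err(format!("field names not in lexicographical order at field id {id}"));
        }
        prev = Some(name);
    }
    Ok(())
}

fn main() {
    let dict = ["apple", "banana", "cherry"];
    assert!(validate_field_order(&dict, &[0, 1, 2]).is_ok());
    assert!(validate_field_order(&dict, &[0, 7]).is_err()); // out-of-range id caught by the probe
    assert!(validate_field_order(&dict, &[1, 0]).is_err()); // names out of order
}
```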
##########
parquet-variant/src/variant/list.rs:
##########
@@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> {
             self.header.first_offset_byte() as _..self.first_value_byte as _,
         )?;

-        let offsets =
-            map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();
-
-        // Validate offsets are in-bounds and monotonically increasing.
-        // Since shallow verification checks whether the first and last offsets are in-bounds,
-        // we can also verify all offsets are in-bounds by checking if offsets are monotonically increasing.
-        let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-        if !are_offsets_monotonic {
-            return Err(ArrowError::InvalidArgumentError(
-                "offsets are not monotonically increasing".to_string(),
-            ));
-        }
-
         let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;

         // Validate whether values are valid variant objects
-        for i in 1..offsets.len() {
-            let start_offset = offsets[i - 1];
-            let end_offset = offsets[i];
-
-            let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
+        //
+        // Since we use offsets to slice into the value buffer, we can also verify all offsets are in-bounds
+        // and monotonically increasing

Review Comment:
I don't quite follow how this check ensures the offsets are monotonically increasing. Is it because `slice_from_slice` requires the bounds to be increasing?
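On the question above: if `slice_from_slice` behaves like a checked wrapper over `slice::get` (a plausible reading, since the diff shows it returning an error via `?` rather than panicking; the exact signature sketched below is an assumption), then yes. Rust's `slice::get(start..end)` returns `None` whenever `start > end` or `end > len`, so slicing each consecutive offset pair rejects both out-of-bounds and decreasing offsets. One caveat: equal adjacent offsets slice to an empty value and are not rejected, which is the one case the dropped strict `a < b` comparison would have caught.

```rust
use std::ops::Range;

/// Hedged sketch of a checked slice helper; the real helper's signature may differ.
fn slice_from_slice(bytes: &[u8], range: Range<usize>) -> Result<&[u8], String> {
    // `get` returns `None` when `range.start > range.end` or `range.end > bytes.len()`,
    // so a checked slice per offset pair rejects decreasing or out-of-bounds offsets.
    bytes
        .get(range.clone())
        .ok_or_else(|| format!("range {range:?} out of bounds for {}-byte buffer", bytes.len()))
}

fn main() {
    let buf = [0u8; 8];
    assert!(slice_from_slice(&buf, 2..5).is_ok()); // increasing and in-bounds
    assert!(slice_from_slice(&buf, 5..2).is_err()); // decreasing: start > end
    assert!(slice_from_slice(&buf, 6..10).is_err()); // end past the buffer
    assert!(slice_from_slice(&buf, 3..3).is_ok()); // equal offsets yield an empty slice, NOT an error
}
```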