scovich commented on code in PR #7807: URL: https://github.com/apache/arrow-rs/pull/7807#discussion_r2176146408
########## parquet-variant/src/variant/object.rs: ########## @@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1; /// Header structure for [`VariantObject`] #[derive(Clone, Debug, PartialEq)] pub(crate) struct VariantObjectHeader { - field_offset_size: OffsetSizeBytes, + num_elements_size: OffsetSizeBytes, field_id_size: OffsetSizeBytes, - is_large: bool, + field_offset_size: OffsetSizeBytes, } impl VariantObjectHeader { + // Hide the ugly casting + const fn num_elements_size(&self) -> usize { + self.num_elements_size as _ + } + const fn field_id_size(&self) -> usize { + self.field_id_size as _ + } + const fn field_offset_size(&self) -> usize { + self.field_offset_size as _ + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn field_ids_start_byte(&self) -> usize { + NUM_HEADER_BYTES + self.num_elements_size() + } + pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> { // Parse the header byte to get object parameters let value_header = header_byte >> 2; let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits let is_large = (value_header & 0x10) != 0; // 5th bit - + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; Ok(Self { - field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, + num_elements_size, field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?, - is_large, + field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, }) } } /// A [`Variant`] Object (struct with named fields). +/// +/// See the [Variant spec] file for more information. +/// +/// # Validation +/// +/// Every instance of variant object is either _valid_ or _invalid_. depending on whether the +/// underlying bytes are a valid encoding of a variant object subtype (see below). +/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully (and recursively) +/// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and +/// indexing are panic-free. The validation cost is linear in the number of underlying bytes. +/// +/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying +/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are +/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// _unvalidated_ instance, if desired. +/// +/// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) field accesses against a large object. +/// +/// A _validated_ instance guarantees that: +/// +/// - header byte is valid +/// - num_elements is in bounds +/// - field id array is in bounds +/// - field offset array is in bounds +/// - field value array is in bounds +/// - all field ids are valid metadata dictionary entries (*) +/// - field ids are lexically ordered according by their corresponding string values (*) +/// - all field offsets are in bounds (*) +/// - all field values are (recursively) _valid_ variant values (*) +/// - the associated variant metadata is [valid] (*) +/// +/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)` +/// in the list above); it panics any of the other checks fails. +/// +/// # Safety +/// +/// Even an _invalid_ variant object instance is still _safe_ to use in the Rust sense. Accessing it +/// with infallible methods may cause panics but will never lead to undefined behavior. +/// +/// [valid]: VariantMetadata#Validation +/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2 #[derive(Clone, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], header: VariantObjectHeader, num_elements: usize, - field_ids_start_byte: usize, - field_offsets_start_byte: usize, - values_start_byte: usize, + first_field_offset_byte: usize, + first_value_byte: usize, + validated: bool, } impl<'m, 'v> VariantObject<'m, 'v> { - /// Attempts to interpret `value` as a variant object value. + pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { + Self::try_new_impl(metadata, value).expect("Invalid variant object") + } + + /// Attempts to interpet `metadata` and `value` as a variant object. /// /// # Validation /// /// This constructor verifies that `value` points to a valid variant object value. In /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point /// to valid objects. - // TODO: How to make the validation non-recursive while still making iterators safely infallible?? - // See https://github.com/apache/arrow-rs/issues/7711 pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> { + let mut new_self = Self::try_new_impl(metadata, value)?.validate()?; + new_self.validated = true; Review Comment: Fixed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org